{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.959441102270522, "eval_steps": 500, "global_step": 30500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009703085581214826, "grad_norm": 0.5093896998931756, "learning_rate": 5e-06, "loss": 1.7158, "step": 10 }, { "epoch": 0.0019406171162429653, "grad_norm": 0.5499013987843694, "learning_rate": 1e-05, "loss": 1.6625, "step": 20 }, { "epoch": 0.002910925674364448, "grad_norm": 0.6551534985295564, "learning_rate": 1.5e-05, "loss": 1.6378, "step": 30 }, { "epoch": 0.0038812342324859306, "grad_norm": 0.7544385328936806, "learning_rate": 2e-05, "loss": 1.5712, "step": 40 }, { "epoch": 0.004851542790607413, "grad_norm": 0.6539371672140126, "learning_rate": 2.5e-05, "loss": 1.5217, "step": 50 }, { "epoch": 0.005821851348728896, "grad_norm": 0.6806376549433046, "learning_rate": 3e-05, "loss": 1.4412, "step": 60 }, { "epoch": 0.0067921599068503785, "grad_norm": 0.9109066482466388, "learning_rate": 3.5e-05, "loss": 1.2303, "step": 70 }, { "epoch": 0.007762468464971861, "grad_norm": 0.8941485474193254, "learning_rate": 4e-05, "loss": 1.1185, "step": 80 }, { "epoch": 0.008732777023093344, "grad_norm": 0.6564948122239989, "learning_rate": 4.5e-05, "loss": 1.0481, "step": 90 }, { "epoch": 0.009703085581214826, "grad_norm": 0.7749322437088195, "learning_rate": 5e-05, "loss": 0.9965, "step": 100 }, { "epoch": 0.010673394139336309, "grad_norm": 0.7305812400337627, "learning_rate": 4.998377571549095e-05, "loss": 1.0689, "step": 110 }, { "epoch": 0.011643702697457792, "grad_norm": 0.7320600025098228, "learning_rate": 4.9967551430981895e-05, "loss": 0.9554, "step": 120 }, { "epoch": 0.012614011255579274, "grad_norm": 0.7997476828820285, "learning_rate": 4.9951327146472845e-05, "loss": 0.9048, "step": 130 }, { "epoch": 0.013584319813700757, "grad_norm": 0.9942301355244718, "learning_rate": 4.993510286196379e-05, "loss": 0.9736, "step": 140 }, { "epoch": 0.01455462837182224, "grad_norm": 0.8990043753777502, "learning_rate": 4.991887857745474e-05, "loss": 0.9447, "step": 150 }, { "epoch": 0.015524936929943722, "grad_norm": 0.6931013922842434, "learning_rate": 4.990265429294568e-05, "loss": 0.8748, "step": 160 }, { "epoch": 0.016495245488065205, "grad_norm": 0.8305007689150241, "learning_rate": 4.988643000843663e-05, "loss": 0.9467, "step": 170 }, { "epoch": 0.017465554046186688, "grad_norm": 0.8993058386814992, "learning_rate": 4.9870205723927573e-05, "loss": 0.9394, "step": 180 }, { "epoch": 0.01843586260430817, "grad_norm": 0.7486900307747556, "learning_rate": 4.985398143941852e-05, "loss": 0.931, "step": 190 }, { "epoch": 0.019406171162429653, "grad_norm": 0.8879788388070488, "learning_rate": 4.983775715490947e-05, "loss": 0.9217, "step": 200 }, { "epoch": 0.020376479720551136, "grad_norm": 0.7351527409279133, "learning_rate": 4.9821532870400416e-05, "loss": 0.8438, "step": 210 }, { "epoch": 0.021346788278672618, "grad_norm": 0.9651267349255982, "learning_rate": 4.9805308585891366e-05, "loss": 0.8806, "step": 220 }, { "epoch": 0.0223170968367941, "grad_norm": 0.8358141761944308, "learning_rate": 4.978908430138231e-05, "loss": 0.9075, "step": 230 }, { "epoch": 0.023287405394915583, "grad_norm": 0.9068073418448239, "learning_rate": 4.977286001687326e-05, "loss": 0.9126, "step": 240 }, { "epoch": 0.024257713953037066, "grad_norm": 0.8510875679665427, "learning_rate": 4.97566357323642e-05, "loss": 0.8509, "step": 250 }, { "epoch": 0.02522802251115855, "grad_norm": 0.8371392324883873, "learning_rate": 4.974041144785515e-05, "loss": 0.9018, "step": 260 }, { "epoch": 0.02619833106928003, "grad_norm": 0.7343155911324605, "learning_rate": 4.9724187163346094e-05, "loss": 0.9251, "step": 270 }, { "epoch": 0.027168639627401514, "grad_norm": 1.012390858760769, "learning_rate": 4.9707962878837044e-05, "loss": 0.8881, "step": 280 }, { "epoch": 0.028138948185522997, "grad_norm": 0.8092967868905977, "learning_rate": 4.9691738594327994e-05, "loss": 0.8831, "step": 290 }, { "epoch": 0.02910925674364448, "grad_norm": 0.9545292755123069, "learning_rate": 4.967551430981894e-05, "loss": 0.8871, "step": 300 }, { "epoch": 0.030079565301765962, "grad_norm": 0.8451115258584492, "learning_rate": 4.9659290025309887e-05, "loss": 0.8913, "step": 310 }, { "epoch": 0.031049873859887445, "grad_norm": 0.984138459636415, "learning_rate": 4.964306574080083e-05, "loss": 0.9046, "step": 320 }, { "epoch": 0.03202018241800893, "grad_norm": 0.9275621393016203, "learning_rate": 4.962684145629178e-05, "loss": 0.88, "step": 330 }, { "epoch": 0.03299049097613041, "grad_norm": 1.0192818586577115, "learning_rate": 4.961061717178272e-05, "loss": 0.8923, "step": 340 }, { "epoch": 0.03396079953425189, "grad_norm": 0.9779533854801943, "learning_rate": 4.959439288727367e-05, "loss": 0.868, "step": 350 }, { "epoch": 0.034931108092373375, "grad_norm": 0.8554421373526296, "learning_rate": 4.957816860276462e-05, "loss": 0.8661, "step": 360 }, { "epoch": 0.03590141665049486, "grad_norm": 0.7404878078324254, "learning_rate": 4.9561944318255565e-05, "loss": 0.8883, "step": 370 }, { "epoch": 0.03687172520861634, "grad_norm": 0.9879847832445883, "learning_rate": 4.9545720033746515e-05, "loss": 0.8156, "step": 380 }, { "epoch": 0.03784203376673782, "grad_norm": 1.0209320862188216, "learning_rate": 4.952949574923746e-05, "loss": 0.8212, "step": 390 }, { "epoch": 0.038812342324859306, "grad_norm": 1.0823734215432599, "learning_rate": 4.951327146472841e-05, "loss": 0.8619, "step": 400 }, { "epoch": 0.03978265088298079, "grad_norm": 1.033686091190584, "learning_rate": 4.949704718021935e-05, "loss": 0.818, "step": 410 }, { "epoch": 0.04075295944110227, "grad_norm": 0.9587642487246072, "learning_rate": 4.94808228957103e-05, "loss": 0.8377, "step": 420 }, { "epoch": 0.041723267999223754, "grad_norm": 0.8072742668289667, "learning_rate": 4.946459861120124e-05, "loss": 0.8043, "step": 430 }, { "epoch": 0.042693576557345236, "grad_norm": 1.0090353092499478, "learning_rate": 4.944837432669219e-05, "loss": 0.8829, "step": 440 }, { "epoch": 0.04366388511546672, "grad_norm": 1.177538600328968, "learning_rate": 4.943215004218314e-05, "loss": 0.859, "step": 450 }, { "epoch": 0.0446341936735882, "grad_norm": 1.225063127801835, "learning_rate": 4.9415925757674086e-05, "loss": 0.8231, "step": 460 }, { "epoch": 0.045604502231709684, "grad_norm": 0.8224632705866025, "learning_rate": 4.9399701473165035e-05, "loss": 0.8335, "step": 470 }, { "epoch": 0.04657481078983117, "grad_norm": 1.153934818941284, "learning_rate": 4.938347718865598e-05, "loss": 0.8046, "step": 480 }, { "epoch": 0.04754511934795265, "grad_norm": 1.0954567838682858, "learning_rate": 4.936725290414693e-05, "loss": 0.8975, "step": 490 }, { "epoch": 0.04851542790607413, "grad_norm": 1.2466392758405571, "learning_rate": 4.935102861963788e-05, "loss": 0.8361, "step": 500 }, { "epoch": 0.049485736464195615, "grad_norm": 0.902356430744448, "learning_rate": 4.933480433512883e-05, "loss": 0.8048, "step": 510 }, { "epoch": 0.0504560450223171, "grad_norm": 1.01027561407788, "learning_rate": 4.931858005061977e-05, "loss": 0.8094, "step": 520 }, { "epoch": 0.05142635358043858, "grad_norm": 1.1330239054084805, "learning_rate": 4.930235576611072e-05, "loss": 0.8288, "step": 530 }, { "epoch": 0.05239666213856006, "grad_norm": 1.19235957479236, "learning_rate": 4.9286131481601664e-05, "loss": 0.8358, "step": 540 }, { "epoch": 0.053366970696681545, "grad_norm": 1.070882879754836, "learning_rate": 4.926990719709261e-05, "loss": 0.783, "step": 550 }, { "epoch": 0.05433727925480303, "grad_norm": 0.772829083325615, "learning_rate": 4.925368291258356e-05, "loss": 0.7888, "step": 560 }, { "epoch": 0.05530758781292451, "grad_norm": 1.0485603174488358, "learning_rate": 4.9237458628074506e-05, "loss": 0.8893, "step": 570 }, { "epoch": 0.05627789637104599, "grad_norm": 1.1365438528959075, "learning_rate": 4.9221234343565456e-05, "loss": 0.8533, "step": 580 }, { "epoch": 0.057248204929167476, "grad_norm": 1.1791401079390134, "learning_rate": 4.92050100590564e-05, "loss": 0.8208, "step": 590 }, { "epoch": 0.05821851348728896, "grad_norm": 1.165054586063552, "learning_rate": 4.918878577454735e-05, "loss": 0.7945, "step": 600 }, { "epoch": 0.05918882204541044, "grad_norm": 1.0001362775159148, "learning_rate": 4.917256149003829e-05, "loss": 0.8426, "step": 610 }, { "epoch": 0.060159130603531924, "grad_norm": 1.1955538077863535, "learning_rate": 4.915633720552924e-05, "loss": 0.8346, "step": 620 }, { "epoch": 0.06112943916165341, "grad_norm": 1.016839980469479, "learning_rate": 4.9140112921020184e-05, "loss": 0.8224, "step": 630 }, { "epoch": 0.06209974771977489, "grad_norm": 0.9782695489269584, "learning_rate": 4.9123888636511134e-05, "loss": 0.849, "step": 640 }, { "epoch": 0.06307005627789637, "grad_norm": 1.0768176346339298, "learning_rate": 4.9107664352002084e-05, "loss": 0.8519, "step": 650 }, { "epoch": 0.06404036483601785, "grad_norm": 1.0636061081219466, "learning_rate": 4.909144006749303e-05, "loss": 0.7887, "step": 660 }, { "epoch": 0.06501067339413934, "grad_norm": 1.0734895912783853, "learning_rate": 4.907521578298398e-05, "loss": 0.8415, "step": 670 }, { "epoch": 0.06598098195226082, "grad_norm": 1.0796531470403106, "learning_rate": 4.905899149847492e-05, "loss": 0.7512, "step": 680 }, { "epoch": 0.0669512905103823, "grad_norm": 0.9361767973637918, "learning_rate": 4.904276721396587e-05, "loss": 0.8482, "step": 690 }, { "epoch": 0.06792159906850379, "grad_norm": 1.5287022172498188, "learning_rate": 4.902654292945681e-05, "loss": 0.8389, "step": 700 }, { "epoch": 0.06889190762662527, "grad_norm": 1.129894676066631, "learning_rate": 4.901031864494776e-05, "loss": 0.7629, "step": 710 }, { "epoch": 0.06986221618474675, "grad_norm": 1.060923634010241, "learning_rate": 4.8994094360438705e-05, "loss": 0.8134, "step": 720 }, { "epoch": 0.07083252474286823, "grad_norm": 1.121507522572716, "learning_rate": 4.8977870075929655e-05, "loss": 0.8295, "step": 730 }, { "epoch": 0.07180283330098972, "grad_norm": 1.2375518463265478, "learning_rate": 4.8961645791420605e-05, "loss": 0.8006, "step": 740 }, { "epoch": 0.0727731418591112, "grad_norm": 1.1957590053535605, "learning_rate": 4.894542150691155e-05, "loss": 0.8185, "step": 750 }, { "epoch": 0.07374345041723268, "grad_norm": 1.1525450079069435, "learning_rate": 4.89291972224025e-05, "loss": 0.8186, "step": 760 }, { "epoch": 0.07471375897535416, "grad_norm": 1.2637346291101144, "learning_rate": 4.891297293789344e-05, "loss": 0.8094, "step": 770 }, { "epoch": 0.07568406753347565, "grad_norm": 1.1118603684784645, "learning_rate": 4.889674865338439e-05, "loss": 0.8075, "step": 780 }, { "epoch": 0.07665437609159713, "grad_norm": 0.987279065085845, "learning_rate": 4.888052436887533e-05, "loss": 0.8088, "step": 790 }, { "epoch": 0.07762468464971861, "grad_norm": 1.050050445685608, "learning_rate": 4.886430008436628e-05, "loss": 0.8283, "step": 800 }, { "epoch": 0.0785949932078401, "grad_norm": 1.1064553828881234, "learning_rate": 4.8848075799857226e-05, "loss": 0.7906, "step": 810 }, { "epoch": 0.07956530176596158, "grad_norm": 1.0679283923210974, "learning_rate": 4.8831851515348176e-05, "loss": 0.7888, "step": 820 }, { "epoch": 0.08053561032408306, "grad_norm": 1.325000406776113, "learning_rate": 4.8815627230839126e-05, "loss": 0.8573, "step": 830 }, { "epoch": 0.08150591888220454, "grad_norm": 1.1430990025005974, "learning_rate": 4.879940294633007e-05, "loss": 0.823, "step": 840 }, { "epoch": 0.08247622744032602, "grad_norm": 0.9708573314011439, "learning_rate": 4.878317866182102e-05, "loss": 0.7812, "step": 850 }, { "epoch": 0.08344653599844751, "grad_norm": 1.040516439704035, "learning_rate": 4.876695437731196e-05, "loss": 0.8043, "step": 860 }, { "epoch": 0.08441684455656899, "grad_norm": 1.4624875951419556, "learning_rate": 4.875073009280291e-05, "loss": 0.8054, "step": 870 }, { "epoch": 0.08538715311469047, "grad_norm": 1.2973550173157966, "learning_rate": 4.8734505808293854e-05, "loss": 0.7693, "step": 880 }, { "epoch": 0.08635746167281196, "grad_norm": 1.21645485433679, "learning_rate": 4.8718281523784804e-05, "loss": 0.7402, "step": 890 }, { "epoch": 0.08732777023093344, "grad_norm": 1.2989905455220712, "learning_rate": 4.870205723927575e-05, "loss": 0.7603, "step": 900 }, { "epoch": 0.08829807878905492, "grad_norm": 1.1647056182819275, "learning_rate": 4.86858329547667e-05, "loss": 0.7576, "step": 910 }, { "epoch": 0.0892683873471764, "grad_norm": 1.32821951027441, "learning_rate": 4.8669608670257646e-05, "loss": 0.7593, "step": 920 }, { "epoch": 0.09023869590529789, "grad_norm": 0.9792995911846096, "learning_rate": 4.865338438574859e-05, "loss": 0.7726, "step": 930 }, { "epoch": 0.09120900446341937, "grad_norm": 1.1716259071546666, "learning_rate": 4.863716010123954e-05, "loss": 0.7585, "step": 940 }, { "epoch": 0.09217931302154085, "grad_norm": 1.4206264005961533, "learning_rate": 4.862093581673048e-05, "loss": 0.7922, "step": 950 }, { "epoch": 0.09314962157966233, "grad_norm": 0.9969780924285648, "learning_rate": 4.860471153222143e-05, "loss": 0.8338, "step": 960 }, { "epoch": 0.09411993013778382, "grad_norm": 1.2259249945683814, "learning_rate": 4.8588487247712375e-05, "loss": 0.7848, "step": 970 }, { "epoch": 0.0950902386959053, "grad_norm": 1.304284918297249, "learning_rate": 4.8572262963203325e-05, "loss": 0.799, "step": 980 }, { "epoch": 0.09606054725402678, "grad_norm": 1.382632512351389, "learning_rate": 4.8556038678694274e-05, "loss": 0.7358, "step": 990 }, { "epoch": 0.09703085581214826, "grad_norm": 1.1047797334502243, "learning_rate": 4.853981439418522e-05, "loss": 0.7175, "step": 1000 }, { "epoch": 0.09800116437026975, "grad_norm": 1.298146739590951, "learning_rate": 4.852359010967617e-05, "loss": 0.7694, "step": 1010 }, { "epoch": 0.09897147292839123, "grad_norm": 1.3449159574026, "learning_rate": 4.850736582516711e-05, "loss": 0.7549, "step": 1020 }, { "epoch": 0.09994178148651271, "grad_norm": 1.0510958795717098, "learning_rate": 4.849114154065806e-05, "loss": 0.7851, "step": 1030 }, { "epoch": 0.1009120900446342, "grad_norm": 1.2932499353997113, "learning_rate": 4.8474917256149e-05, "loss": 0.7948, "step": 1040 }, { "epoch": 0.10188239860275568, "grad_norm": 1.024772482994231, "learning_rate": 4.845869297163995e-05, "loss": 0.7551, "step": 1050 }, { "epoch": 0.10285270716087716, "grad_norm": 0.9151226850367016, "learning_rate": 4.8442468687130896e-05, "loss": 0.8212, "step": 1060 }, { "epoch": 0.10382301571899864, "grad_norm": 1.0356064137085648, "learning_rate": 4.8426244402621846e-05, "loss": 0.7815, "step": 1070 }, { "epoch": 0.10479332427712013, "grad_norm": 1.0454220890712578, "learning_rate": 4.8410020118112795e-05, "loss": 0.7813, "step": 1080 }, { "epoch": 0.10576363283524161, "grad_norm": 1.0245673186100301, "learning_rate": 4.839379583360374e-05, "loss": 0.7759, "step": 1090 }, { "epoch": 0.10673394139336309, "grad_norm": 1.1805883290044246, "learning_rate": 4.837757154909469e-05, "loss": 0.8016, "step": 1100 }, { "epoch": 0.10770424995148457, "grad_norm": 1.305171444801399, "learning_rate": 4.836134726458563e-05, "loss": 0.7603, "step": 1110 }, { "epoch": 0.10867455850960606, "grad_norm": 1.1334982322569604, "learning_rate": 4.834512298007658e-05, "loss": 0.7818, "step": 1120 }, { "epoch": 0.10964486706772754, "grad_norm": 1.4897469886835581, "learning_rate": 4.8328898695567524e-05, "loss": 0.7391, "step": 1130 }, { "epoch": 0.11061517562584902, "grad_norm": 1.12299562877673, "learning_rate": 4.8312674411058474e-05, "loss": 0.8129, "step": 1140 }, { "epoch": 0.1115854841839705, "grad_norm": 1.2581064725802191, "learning_rate": 4.8296450126549417e-05, "loss": 0.7132, "step": 1150 }, { "epoch": 0.11255579274209199, "grad_norm": 1.1117033056933057, "learning_rate": 4.8280225842040366e-05, "loss": 0.7247, "step": 1160 }, { "epoch": 0.11352610130021347, "grad_norm": 1.5072697151276053, "learning_rate": 4.8264001557531316e-05, "loss": 0.7832, "step": 1170 }, { "epoch": 0.11449640985833495, "grad_norm": 1.3629201153146466, "learning_rate": 4.824777727302226e-05, "loss": 0.7894, "step": 1180 }, { "epoch": 0.11546671841645643, "grad_norm": 1.3456355263382838, "learning_rate": 4.823155298851321e-05, "loss": 0.8145, "step": 1190 }, { "epoch": 0.11643702697457792, "grad_norm": 1.3206439343547949, "learning_rate": 4.821532870400415e-05, "loss": 0.7802, "step": 1200 }, { "epoch": 0.1174073355326994, "grad_norm": 1.0980976013487813, "learning_rate": 4.81991044194951e-05, "loss": 0.801, "step": 1210 }, { "epoch": 0.11837764409082088, "grad_norm": 1.1370443655089153, "learning_rate": 4.8182880134986045e-05, "loss": 0.8006, "step": 1220 }, { "epoch": 0.11934795264894237, "grad_norm": 1.4354316240483984, "learning_rate": 4.8166655850476994e-05, "loss": 0.8131, "step": 1230 }, { "epoch": 0.12031826120706385, "grad_norm": 0.9112243461310537, "learning_rate": 4.815043156596794e-05, "loss": 0.8145, "step": 1240 }, { "epoch": 0.12128856976518533, "grad_norm": 0.8213091259360801, "learning_rate": 4.813420728145889e-05, "loss": 0.7114, "step": 1250 }, { "epoch": 0.12225887832330681, "grad_norm": 1.3405078182982422, "learning_rate": 4.811798299694984e-05, "loss": 0.7986, "step": 1260 }, { "epoch": 0.1232291868814283, "grad_norm": 1.4540711324279514, "learning_rate": 4.810175871244078e-05, "loss": 0.7281, "step": 1270 }, { "epoch": 0.12419949543954978, "grad_norm": 1.0040222075537582, "learning_rate": 4.8085534427931737e-05, "loss": 0.8048, "step": 1280 }, { "epoch": 0.12516980399767125, "grad_norm": 1.9264161003194282, "learning_rate": 4.806931014342268e-05, "loss": 0.7512, "step": 1290 }, { "epoch": 0.12614011255579274, "grad_norm": 1.155430871835739, "learning_rate": 4.805308585891363e-05, "loss": 0.7909, "step": 1300 }, { "epoch": 0.1271104211139142, "grad_norm": 1.382153914724162, "learning_rate": 4.803686157440457e-05, "loss": 0.7613, "step": 1310 }, { "epoch": 0.1280807296720357, "grad_norm": 1.1079273142955168, "learning_rate": 4.802063728989552e-05, "loss": 0.7747, "step": 1320 }, { "epoch": 0.12905103823015718, "grad_norm": 1.4797892743755068, "learning_rate": 4.8004413005386465e-05, "loss": 0.7448, "step": 1330 }, { "epoch": 0.13002134678827867, "grad_norm": 1.2840858833683126, "learning_rate": 4.7988188720877415e-05, "loss": 0.7567, "step": 1340 }, { "epoch": 0.13099165534640014, "grad_norm": 1.7212676971427285, "learning_rate": 4.797196443636836e-05, "loss": 0.7743, "step": 1350 }, { "epoch": 0.13196196390452164, "grad_norm": 1.7283159526025742, "learning_rate": 4.795574015185931e-05, "loss": 0.7696, "step": 1360 }, { "epoch": 0.1329322724626431, "grad_norm": 1.0739621496825589, "learning_rate": 4.793951586735026e-05, "loss": 0.7408, "step": 1370 }, { "epoch": 0.1339025810207646, "grad_norm": 1.183199109006057, "learning_rate": 4.79232915828412e-05, "loss": 0.7935, "step": 1380 }, { "epoch": 0.13487288957888607, "grad_norm": 1.3981562096537532, "learning_rate": 4.790706729833215e-05, "loss": 0.7113, "step": 1390 }, { "epoch": 0.13584319813700757, "grad_norm": 1.192516841319882, "learning_rate": 4.789084301382309e-05, "loss": 0.7786, "step": 1400 }, { "epoch": 0.13681350669512904, "grad_norm": 1.9269898255758637, "learning_rate": 4.787461872931404e-05, "loss": 0.7262, "step": 1410 }, { "epoch": 0.13778381525325054, "grad_norm": 1.1218056549529871, "learning_rate": 4.7858394444804986e-05, "loss": 0.6762, "step": 1420 }, { "epoch": 0.138754123811372, "grad_norm": 1.3635607352483248, "learning_rate": 4.7842170160295936e-05, "loss": 0.7733, "step": 1430 }, { "epoch": 0.1397244323694935, "grad_norm": 1.2380674478503626, "learning_rate": 4.782594587578688e-05, "loss": 0.7599, "step": 1440 }, { "epoch": 0.14069474092761497, "grad_norm": 1.4869366640536255, "learning_rate": 4.780972159127783e-05, "loss": 0.7408, "step": 1450 }, { "epoch": 0.14166504948573647, "grad_norm": 1.413887084722376, "learning_rate": 4.779349730676878e-05, "loss": 0.7792, "step": 1460 }, { "epoch": 0.14263535804385793, "grad_norm": 1.1522475316568597, "learning_rate": 4.777727302225972e-05, "loss": 0.7494, "step": 1470 }, { "epoch": 0.14360566660197943, "grad_norm": 1.5458892686752617, "learning_rate": 4.776104873775067e-05, "loss": 0.7151, "step": 1480 }, { "epoch": 0.1445759751601009, "grad_norm": 1.8892150029961168, "learning_rate": 4.7744824453241614e-05, "loss": 0.7025, "step": 1490 }, { "epoch": 0.1455462837182224, "grad_norm": 1.5206314987834826, "learning_rate": 4.7728600168732564e-05, "loss": 0.7748, "step": 1500 }, { "epoch": 0.14651659227634387, "grad_norm": 1.2234736643633124, "learning_rate": 4.771237588422351e-05, "loss": 0.7321, "step": 1510 }, { "epoch": 0.14748690083446536, "grad_norm": 1.3582944219822406, "learning_rate": 4.7696151599714456e-05, "loss": 0.7524, "step": 1520 }, { "epoch": 0.14845720939258683, "grad_norm": 1.3505468356868415, "learning_rate": 4.76799273152054e-05, "loss": 0.8032, "step": 1530 }, { "epoch": 0.14942751795070833, "grad_norm": 1.161565772908295, "learning_rate": 4.766370303069635e-05, "loss": 0.7449, "step": 1540 }, { "epoch": 0.1503978265088298, "grad_norm": 1.5018575379494306, "learning_rate": 4.76474787461873e-05, "loss": 0.779, "step": 1550 }, { "epoch": 0.1513681350669513, "grad_norm": 1.2744144878497465, "learning_rate": 4.763125446167824e-05, "loss": 0.7396, "step": 1560 }, { "epoch": 0.15233844362507276, "grad_norm": 1.2326350258332728, "learning_rate": 4.761503017716919e-05, "loss": 0.7249, "step": 1570 }, { "epoch": 0.15330875218319426, "grad_norm": 1.2982918445496714, "learning_rate": 4.7598805892660135e-05, "loss": 0.7453, "step": 1580 }, { "epoch": 0.15427906074131573, "grad_norm": 1.4792372825641715, "learning_rate": 4.7582581608151085e-05, "loss": 0.7573, "step": 1590 }, { "epoch": 0.15524936929943722, "grad_norm": 1.2935035055138504, "learning_rate": 4.756635732364203e-05, "loss": 0.7125, "step": 1600 }, { "epoch": 0.1562196778575587, "grad_norm": 1.2527694639896991, "learning_rate": 4.755013303913298e-05, "loss": 0.7406, "step": 1610 }, { "epoch": 0.1571899864156802, "grad_norm": 1.0724672098454868, "learning_rate": 4.753390875462392e-05, "loss": 0.7108, "step": 1620 }, { "epoch": 0.15816029497380166, "grad_norm": 1.1696947872465324, "learning_rate": 4.751768447011487e-05, "loss": 0.7636, "step": 1630 }, { "epoch": 0.15913060353192315, "grad_norm": 1.216857763890884, "learning_rate": 4.750146018560582e-05, "loss": 0.7916, "step": 1640 }, { "epoch": 0.16010091209004462, "grad_norm": 1.1371281502973842, "learning_rate": 4.748523590109676e-05, "loss": 0.7663, "step": 1650 }, { "epoch": 0.16107122064816612, "grad_norm": 1.2599433230373354, "learning_rate": 4.746901161658771e-05, "loss": 0.7279, "step": 1660 }, { "epoch": 0.1620415292062876, "grad_norm": 1.4580521921521419, "learning_rate": 4.7452787332078656e-05, "loss": 0.7241, "step": 1670 }, { "epoch": 0.16301183776440908, "grad_norm": 1.2227652156436173, "learning_rate": 4.7436563047569605e-05, "loss": 0.7082, "step": 1680 }, { "epoch": 0.16398214632253055, "grad_norm": 1.3578429437561153, "learning_rate": 4.742033876306055e-05, "loss": 0.6972, "step": 1690 }, { "epoch": 0.16495245488065205, "grad_norm": 1.2809095001474842, "learning_rate": 4.74041144785515e-05, "loss": 0.7267, "step": 1700 }, { "epoch": 0.16592276343877352, "grad_norm": 1.1017366555347645, "learning_rate": 4.738789019404245e-05, "loss": 0.7243, "step": 1710 }, { "epoch": 0.16689307199689501, "grad_norm": 1.2852463688715783, "learning_rate": 4.737166590953339e-05, "loss": 0.7276, "step": 1720 }, { "epoch": 0.16786338055501648, "grad_norm": 1.3099425645156408, "learning_rate": 4.735544162502434e-05, "loss": 0.7583, "step": 1730 }, { "epoch": 0.16883368911313798, "grad_norm": 1.5220953005112245, "learning_rate": 4.7339217340515284e-05, "loss": 0.7651, "step": 1740 }, { "epoch": 0.16980399767125945, "grad_norm": 1.3251019142596, "learning_rate": 4.7322993056006233e-05, "loss": 0.7428, "step": 1750 }, { "epoch": 0.17077430622938095, "grad_norm": 1.3275994643711895, "learning_rate": 4.7306768771497176e-05, "loss": 0.7552, "step": 1760 }, { "epoch": 0.17174461478750241, "grad_norm": 1.1363294732621385, "learning_rate": 4.7290544486988126e-05, "loss": 0.7441, "step": 1770 }, { "epoch": 0.1727149233456239, "grad_norm": 1.6604556501118164, "learning_rate": 4.727432020247907e-05, "loss": 0.7404, "step": 1780 }, { "epoch": 0.17368523190374538, "grad_norm": 1.4971063178125654, "learning_rate": 4.725809591797002e-05, "loss": 0.707, "step": 1790 }, { "epoch": 0.17465554046186688, "grad_norm": 1.296038406932857, "learning_rate": 4.724187163346097e-05, "loss": 0.7226, "step": 1800 }, { "epoch": 0.17562584901998834, "grad_norm": 0.9900558600646284, "learning_rate": 4.722564734895191e-05, "loss": 0.7107, "step": 1810 }, { "epoch": 0.17659615757810984, "grad_norm": 0.9589095684989648, "learning_rate": 4.720942306444286e-05, "loss": 0.6789, "step": 1820 }, { "epoch": 0.1775664661362313, "grad_norm": 1.300487413115222, "learning_rate": 4.7193198779933805e-05, "loss": 0.7656, "step": 1830 }, { "epoch": 0.1785367746943528, "grad_norm": 1.4678054839136885, "learning_rate": 4.7176974495424754e-05, "loss": 0.7559, "step": 1840 }, { "epoch": 0.17950708325247428, "grad_norm": 1.1487384302132937, "learning_rate": 4.71607502109157e-05, "loss": 0.7218, "step": 1850 }, { "epoch": 0.18047739181059577, "grad_norm": 1.5013237178369594, "learning_rate": 4.714452592640665e-05, "loss": 0.6618, "step": 1860 }, { "epoch": 0.18144770036871724, "grad_norm": 1.3544109774626758, "learning_rate": 4.712830164189759e-05, "loss": 0.7348, "step": 1870 }, { "epoch": 0.18241800892683874, "grad_norm": 1.7098997535129123, "learning_rate": 4.711207735738854e-05, "loss": 0.6721, "step": 1880 }, { "epoch": 0.1833883174849602, "grad_norm": 1.439742442692076, "learning_rate": 4.709585307287949e-05, "loss": 0.6823, "step": 1890 }, { "epoch": 0.1843586260430817, "grad_norm": 1.1240799236385792, "learning_rate": 4.707962878837043e-05, "loss": 0.7337, "step": 1900 }, { "epoch": 0.18532893460120317, "grad_norm": 1.611378134043144, "learning_rate": 4.706340450386138e-05, "loss": 0.7348, "step": 1910 }, { "epoch": 0.18629924315932467, "grad_norm": 1.15451740477432, "learning_rate": 4.7047180219352325e-05, "loss": 0.6945, "step": 1920 }, { "epoch": 0.18726955171744614, "grad_norm": 1.5743486470192276, "learning_rate": 4.7030955934843275e-05, "loss": 0.8062, "step": 1930 }, { "epoch": 0.18823986027556763, "grad_norm": 1.0248040084556684, "learning_rate": 4.701473165033422e-05, "loss": 0.7155, "step": 1940 }, { "epoch": 0.1892101688336891, "grad_norm": 1.0310414805904136, "learning_rate": 4.699850736582517e-05, "loss": 0.6776, "step": 1950 }, { "epoch": 0.1901804773918106, "grad_norm": 1.0785156193356158, "learning_rate": 4.698228308131611e-05, "loss": 0.7092, "step": 1960 }, { "epoch": 0.19115078594993207, "grad_norm": 1.4378941602091937, "learning_rate": 4.696605879680706e-05, "loss": 0.7286, "step": 1970 }, { "epoch": 0.19212109450805356, "grad_norm": 1.1595135486156671, "learning_rate": 4.694983451229801e-05, "loss": 0.7479, "step": 1980 }, { "epoch": 0.19309140306617503, "grad_norm": 1.69348514333408, "learning_rate": 4.6933610227788953e-05, "loss": 0.7263, "step": 1990 }, { "epoch": 0.19406171162429653, "grad_norm": 1.502953634133657, "learning_rate": 4.69173859432799e-05, "loss": 0.7655, "step": 2000 }, { "epoch": 0.19406171162429653, "eval_loss": 0.7587813138961792, "eval_runtime": 2477.8973, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.362, "step": 2000 }, { "epoch": 0.195032020182418, "grad_norm": 1.4070334559445785, "learning_rate": 4.6901161658770846e-05, "loss": 0.7475, "step": 2010 }, { "epoch": 0.1960023287405395, "grad_norm": 0.8673128363877267, "learning_rate": 4.6884937374261796e-05, "loss": 0.6747, "step": 2020 }, { "epoch": 0.19697263729866096, "grad_norm": 1.428849127809278, "learning_rate": 4.686871308975274e-05, "loss": 0.7666, "step": 2030 }, { "epoch": 0.19794294585678246, "grad_norm": 1.2749540514563555, "learning_rate": 4.685248880524369e-05, "loss": 0.7346, "step": 2040 }, { "epoch": 0.19891325441490393, "grad_norm": 1.2400883720583105, "learning_rate": 4.683626452073464e-05, "loss": 0.7012, "step": 2050 }, { "epoch": 0.19988356297302542, "grad_norm": 1.4035260703119048, "learning_rate": 4.682004023622559e-05, "loss": 0.6697, "step": 2060 }, { "epoch": 0.2008538715311469, "grad_norm": 1.9046063428505313, "learning_rate": 4.680381595171653e-05, "loss": 0.7126, "step": 2070 }, { "epoch": 0.2018241800892684, "grad_norm": 1.3281602578294986, "learning_rate": 4.678759166720748e-05, "loss": 0.6895, "step": 2080 }, { "epoch": 0.20279448864738986, "grad_norm": 1.271644663424638, "learning_rate": 4.677136738269843e-05, "loss": 0.7354, "step": 2090 }, { "epoch": 0.20376479720551136, "grad_norm": 1.2206696245686643, "learning_rate": 4.6755143098189374e-05, "loss": 0.7252, "step": 2100 }, { "epoch": 0.20473510576363282, "grad_norm": 1.3032653564716208, "learning_rate": 4.6738918813680324e-05, "loss": 0.683, "step": 2110 }, { "epoch": 0.20570541432175432, "grad_norm": 1.3212954807490243, "learning_rate": 4.6722694529171267e-05, "loss": 0.737, "step": 2120 }, { "epoch": 0.2066757228798758, "grad_norm": 1.2586128849417775, "learning_rate": 4.6706470244662216e-05, "loss": 0.6841, "step": 2130 }, { "epoch": 0.20764603143799729, "grad_norm": 1.3491929764497603, "learning_rate": 4.669024596015316e-05, "loss": 0.7637, "step": 2140 }, { "epoch": 0.20861633999611875, "grad_norm": 1.5081978528458062, "learning_rate": 4.667402167564411e-05, "loss": 0.713, "step": 2150 }, { "epoch": 0.20958664855424025, "grad_norm": 1.5788893940918114, "learning_rate": 4.665779739113505e-05, "loss": 0.6337, "step": 2160 }, { "epoch": 0.21055695711236172, "grad_norm": 1.422646038947752, "learning_rate": 4.6641573106626e-05, "loss": 0.6653, "step": 2170 }, { "epoch": 0.21152726567048322, "grad_norm": 1.3417475771584986, "learning_rate": 4.662534882211695e-05, "loss": 0.7246, "step": 2180 }, { "epoch": 0.21249757422860469, "grad_norm": 1.0925391942237144, "learning_rate": 4.6609124537607895e-05, "loss": 0.7228, "step": 2190 }, { "epoch": 0.21346788278672618, "grad_norm": 1.8433218311262853, "learning_rate": 4.6592900253098844e-05, "loss": 0.7507, "step": 2200 }, { "epoch": 0.21443819134484765, "grad_norm": 1.792431274692439, "learning_rate": 4.657667596858979e-05, "loss": 0.6949, "step": 2210 }, { "epoch": 0.21540849990296915, "grad_norm": 1.3809251069016177, "learning_rate": 4.656045168408074e-05, "loss": 0.7252, "step": 2220 }, { "epoch": 0.21637880846109062, "grad_norm": 1.3641658734062512, "learning_rate": 4.654422739957168e-05, "loss": 0.7518, "step": 2230 }, { "epoch": 0.2173491170192121, "grad_norm": 1.452315608698053, "learning_rate": 4.652800311506263e-05, "loss": 0.6684, "step": 2240 }, { "epoch": 0.21831942557733358, "grad_norm": 1.1444356316783801, "learning_rate": 4.651177883055357e-05, "loss": 0.6714, "step": 2250 }, { "epoch": 0.21928973413545508, "grad_norm": 1.2977358748240138, "learning_rate": 4.649555454604452e-05, "loss": 0.7129, "step": 2260 }, { "epoch": 0.22026004269357655, "grad_norm": 1.328329389241565, "learning_rate": 4.647933026153547e-05, "loss": 0.6292, "step": 2270 }, { "epoch": 0.22123035125169804, "grad_norm": 1.2461104595186587, "learning_rate": 4.6463105977026415e-05, "loss": 0.7372, "step": 2280 }, { "epoch": 0.2222006598098195, "grad_norm": 1.3008437677725404, "learning_rate": 4.6446881692517365e-05, "loss": 0.6503, "step": 2290 }, { "epoch": 0.223170968367941, "grad_norm": 1.3630765232741517, "learning_rate": 4.643065740800831e-05, "loss": 0.657, "step": 2300 }, { "epoch": 0.22414127692606248, "grad_norm": 0.9600325037717949, "learning_rate": 4.641443312349926e-05, "loss": 0.7117, "step": 2310 }, { "epoch": 0.22511158548418397, "grad_norm": 1.733755330685857, "learning_rate": 4.63982088389902e-05, "loss": 0.7594, "step": 2320 }, { "epoch": 0.22608189404230544, "grad_norm": 1.722193618002385, "learning_rate": 4.638198455448115e-05, "loss": 0.6555, "step": 2330 }, { "epoch": 0.22705220260042694, "grad_norm": 1.2647254702280388, "learning_rate": 4.63657602699721e-05, "loss": 0.6996, "step": 2340 }, { "epoch": 0.2280225111585484, "grad_norm": 1.2801225623311008, "learning_rate": 4.6349535985463043e-05, "loss": 0.7038, "step": 2350 }, { "epoch": 0.2289928197166699, "grad_norm": 1.214420395864136, "learning_rate": 4.633331170095399e-05, "loss": 0.6617, "step": 2360 }, { "epoch": 0.22996312827479137, "grad_norm": 1.1970250608654163, "learning_rate": 4.6317087416444936e-05, "loss": 0.7505, "step": 2370 }, { "epoch": 0.23093343683291287, "grad_norm": 1.318541272085749, "learning_rate": 4.6300863131935886e-05, "loss": 0.703, "step": 2380 }, { "epoch": 0.23190374539103434, "grad_norm": 1.0733620278703964, "learning_rate": 4.628463884742683e-05, "loss": 0.7076, "step": 2390 }, { "epoch": 0.23287405394915583, "grad_norm": 1.320066160024292, "learning_rate": 4.626841456291778e-05, "loss": 0.6845, "step": 2400 }, { "epoch": 0.2338443625072773, "grad_norm": 1.3916335472579557, "learning_rate": 4.625219027840872e-05, "loss": 0.7213, "step": 2410 }, { "epoch": 0.2348146710653988, "grad_norm": 1.6506684734262902, "learning_rate": 4.623596599389967e-05, "loss": 0.6995, "step": 2420 }, { "epoch": 0.23578497962352027, "grad_norm": 0.8976988021024955, "learning_rate": 4.621974170939062e-05, "loss": 0.6994, "step": 2430 }, { "epoch": 0.23675528818164177, "grad_norm": 1.7783475330175254, "learning_rate": 4.6203517424881564e-05, "loss": 0.7475, "step": 2440 }, { "epoch": 0.23772559673976323, "grad_norm": 1.3329956259541478, "learning_rate": 4.6187293140372514e-05, "loss": 0.6991, "step": 2450 }, { "epoch": 0.23869590529788473, "grad_norm": 1.3282617232306233, "learning_rate": 4.617106885586346e-05, "loss": 0.7328, "step": 2460 }, { "epoch": 0.2396662138560062, "grad_norm": 1.1394548446385124, "learning_rate": 4.615484457135441e-05, "loss": 0.7092, "step": 2470 }, { "epoch": 0.2406365224141277, "grad_norm": 1.3304671581253036, "learning_rate": 4.613862028684535e-05, "loss": 0.6765, "step": 2480 }, { "epoch": 0.24160683097224916, "grad_norm": 1.5514738877871737, "learning_rate": 4.61223960023363e-05, "loss": 0.6558, "step": 2490 }, { "epoch": 0.24257713953037066, "grad_norm": 1.3668799500092241, "learning_rate": 4.610617171782724e-05, "loss": 0.7106, "step": 2500 }, { "epoch": 0.24257713953037066, "eval_loss": 0.7454198598861694, "eval_runtime": 2468.3109, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 2500 }, { "epoch": 0.24354744808849213, "grad_norm": 1.27051542074557, "learning_rate": 4.608994743331819e-05, "loss": 0.7037, "step": 2510 }, { "epoch": 0.24451775664661363, "grad_norm": 1.4333699367072212, "learning_rate": 4.607372314880914e-05, "loss": 0.7054, "step": 2520 }, { "epoch": 0.2454880652047351, "grad_norm": 1.3383562796673616, "learning_rate": 4.6057498864300085e-05, "loss": 0.7674, "step": 2530 }, { "epoch": 0.2464583737628566, "grad_norm": 1.5405408532446832, "learning_rate": 4.6041274579791035e-05, "loss": 0.7541, "step": 2540 }, { "epoch": 0.24742868232097806, "grad_norm": 1.4592658544748531, "learning_rate": 4.602505029528198e-05, "loss": 0.7286, "step": 2550 }, { "epoch": 0.24839899087909956, "grad_norm": 1.4060719508768202, "learning_rate": 4.600882601077293e-05, "loss": 0.7722, "step": 2560 }, { "epoch": 0.24936929943722103, "grad_norm": 1.2688352607364943, "learning_rate": 4.599260172626387e-05, "loss": 0.6851, "step": 2570 }, { "epoch": 0.2503396079953425, "grad_norm": 1.1798367463314652, "learning_rate": 4.597637744175482e-05, "loss": 0.6897, "step": 2580 }, { "epoch": 0.251309916553464, "grad_norm": 1.4210256244438406, "learning_rate": 4.5960153157245763e-05, "loss": 0.7005, "step": 2590 }, { "epoch": 0.2522802251115855, "grad_norm": 1.3304574917634664, "learning_rate": 4.594392887273671e-05, "loss": 0.6878, "step": 2600 }, { "epoch": 0.25325053366970696, "grad_norm": 1.4281359203156077, "learning_rate": 4.592770458822766e-05, "loss": 0.7519, "step": 2610 }, { "epoch": 0.2542208422278284, "grad_norm": 1.351420571440429, "learning_rate": 4.5911480303718606e-05, "loss": 0.6939, "step": 2620 }, { "epoch": 0.25519115078594995, "grad_norm": 1.3618970623647955, "learning_rate": 4.5895256019209556e-05, "loss": 0.669, "step": 2630 }, { "epoch": 0.2561614593440714, "grad_norm": 1.1008259683717303, "learning_rate": 4.58790317347005e-05, "loss": 0.6331, "step": 2640 }, { "epoch": 0.2571317679021929, "grad_norm": 1.690823489066079, "learning_rate": 4.586280745019145e-05, "loss": 0.6571, "step": 2650 }, { "epoch": 0.25810207646031436, "grad_norm": 1.2328191346656623, "learning_rate": 4.584658316568239e-05, "loss": 0.6712, "step": 2660 }, { "epoch": 0.2590723850184359, "grad_norm": 1.1997509506925832, "learning_rate": 4.583035888117334e-05, "loss": 0.6998, "step": 2670 }, { "epoch": 0.26004269357655735, "grad_norm": 1.3726212075390893, "learning_rate": 4.5814134596664284e-05, "loss": 0.6577, "step": 2680 }, { "epoch": 0.2610130021346788, "grad_norm": 1.4778299478224584, "learning_rate": 4.5797910312155234e-05, "loss": 0.7213, "step": 2690 }, { "epoch": 0.2619833106928003, "grad_norm": 1.2065482843241282, "learning_rate": 4.5781686027646184e-05, "loss": 0.6504, "step": 2700 }, { "epoch": 0.2629536192509218, "grad_norm": 1.6950271620933635, "learning_rate": 4.576546174313713e-05, "loss": 0.6938, "step": 2710 }, { "epoch": 0.2639239278090433, "grad_norm": 1.049429219350124, "learning_rate": 4.5749237458628077e-05, "loss": 0.7235, "step": 2720 }, { "epoch": 0.26489423636716475, "grad_norm": 1.2856525880832654, "learning_rate": 4.573301317411902e-05, "loss": 0.769, "step": 2730 }, { "epoch": 0.2658645449252862, "grad_norm": 1.1413973985217811, "learning_rate": 4.571678888960997e-05, "loss": 0.6542, "step": 2740 }, { "epoch": 0.26683485348340774, "grad_norm": 1.2963652895204112, "learning_rate": 4.570056460510091e-05, "loss": 0.6988, "step": 2750 }, { "epoch": 0.2678051620415292, "grad_norm": 1.200192916715494, "learning_rate": 4.568434032059186e-05, "loss": 0.662, "step": 2760 }, { "epoch": 0.2687754705996507, "grad_norm": 1.2418786623251215, "learning_rate": 4.566811603608281e-05, "loss": 0.6615, "step": 2770 }, { "epoch": 0.26974577915777215, "grad_norm": 1.2920018198618357, "learning_rate": 4.5651891751573755e-05, "loss": 0.683, "step": 2780 }, { "epoch": 0.27071608771589367, "grad_norm": 1.721771869410589, "learning_rate": 4.5635667467064705e-05, "loss": 0.6742, "step": 2790 }, { "epoch": 0.27168639627401514, "grad_norm": 1.595428896858632, "learning_rate": 4.561944318255565e-05, "loss": 0.6961, "step": 2800 }, { "epoch": 0.2726567048321366, "grad_norm": 1.5674989639835977, "learning_rate": 4.56032188980466e-05, "loss": 0.6246, "step": 2810 }, { "epoch": 0.2736270133902581, "grad_norm": 1.333761320008167, "learning_rate": 4.558699461353754e-05, "loss": 0.7142, "step": 2820 }, { "epoch": 0.2745973219483796, "grad_norm": 1.175942959739195, "learning_rate": 4.557077032902849e-05, "loss": 0.7335, "step": 2830 }, { "epoch": 0.27556763050650107, "grad_norm": 1.0433896859463463, "learning_rate": 4.555454604451944e-05, "loss": 0.7523, "step": 2840 }, { "epoch": 0.27653793906462254, "grad_norm": 1.4484346047275096, "learning_rate": 4.553832176001039e-05, "loss": 0.6576, "step": 2850 }, { "epoch": 0.277508247622744, "grad_norm": 1.922226387784083, "learning_rate": 4.552209747550133e-05, "loss": 0.6797, "step": 2860 }, { "epoch": 0.27847855618086553, "grad_norm": 1.4433287579337053, "learning_rate": 4.550587319099228e-05, "loss": 0.6787, "step": 2870 }, { "epoch": 0.279448864738987, "grad_norm": 1.4997180997318538, "learning_rate": 4.5489648906483225e-05, "loss": 0.7091, "step": 2880 }, { "epoch": 0.28041917329710847, "grad_norm": 1.347204680869145, "learning_rate": 4.5473424621974175e-05, "loss": 0.725, "step": 2890 }, { "epoch": 0.28138948185522994, "grad_norm": 1.3046706448190106, "learning_rate": 4.5457200337465125e-05, "loss": 0.6669, "step": 2900 }, { "epoch": 0.28235979041335146, "grad_norm": 1.1467739204887912, "learning_rate": 4.544097605295607e-05, "loss": 0.756, "step": 2910 }, { "epoch": 0.28333009897147293, "grad_norm": 1.162771827035537, "learning_rate": 4.542475176844702e-05, "loss": 0.7147, "step": 2920 }, { "epoch": 0.2843004075295944, "grad_norm": 1.6563795609214405, "learning_rate": 4.540852748393796e-05, "loss": 0.6845, "step": 2930 }, { "epoch": 0.28527071608771587, "grad_norm": 1.6728193237645246, "learning_rate": 4.539230319942891e-05, "loss": 0.673, "step": 2940 }, { "epoch": 0.2862410246458374, "grad_norm": 1.194612705131068, "learning_rate": 4.5376078914919854e-05, "loss": 0.6942, "step": 2950 }, { "epoch": 0.28721133320395886, "grad_norm": 1.439958876835649, "learning_rate": 4.53598546304108e-05, "loss": 0.6809, "step": 2960 }, { "epoch": 0.28818164176208033, "grad_norm": 1.5873335965261735, "learning_rate": 4.534363034590175e-05, "loss": 0.6877, "step": 2970 }, { "epoch": 0.2891519503202018, "grad_norm": 1.0997513483559662, "learning_rate": 4.5327406061392696e-05, "loss": 0.665, "step": 2980 }, { "epoch": 0.2901222588783233, "grad_norm": 1.3952744081716912, "learning_rate": 4.5311181776883646e-05, "loss": 0.7524, "step": 2990 }, { "epoch": 0.2910925674364448, "grad_norm": 1.1613111987177211, "learning_rate": 4.529495749237459e-05, "loss": 0.6659, "step": 3000 }, { "epoch": 0.2910925674364448, "eval_loss": 0.7351760268211365, "eval_runtime": 2466.7648, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 3000 }, { "epoch": 0.29206287599456626, "grad_norm": 1.404418707672653, "learning_rate": 4.527873320786554e-05, "loss": 0.6635, "step": 3010 }, { "epoch": 0.29303318455268773, "grad_norm": 1.7333713648105467, "learning_rate": 4.526250892335648e-05, "loss": 0.6918, "step": 3020 }, { "epoch": 0.29400349311080926, "grad_norm": 1.325947337933012, "learning_rate": 4.524628463884743e-05, "loss": 0.708, "step": 3030 }, { "epoch": 0.2949738016689307, "grad_norm": 1.420774851884038, "learning_rate": 4.5230060354338374e-05, "loss": 0.7173, "step": 3040 }, { "epoch": 0.2959441102270522, "grad_norm": 1.1753762610267322, "learning_rate": 4.5213836069829324e-05, "loss": 0.6654, "step": 3050 }, { "epoch": 0.29691441878517366, "grad_norm": 1.526767780243654, "learning_rate": 4.5197611785320274e-05, "loss": 0.6321, "step": 3060 }, { "epoch": 0.2978847273432952, "grad_norm": 1.8280158534376985, "learning_rate": 4.518138750081122e-05, "loss": 0.6647, "step": 3070 }, { "epoch": 0.29885503590141665, "grad_norm": 1.4181228966831005, "learning_rate": 4.516516321630217e-05, "loss": 0.7338, "step": 3080 }, { "epoch": 0.2998253444595381, "grad_norm": 1.7077549961994072, "learning_rate": 4.514893893179311e-05, "loss": 0.6272, "step": 3090 }, { "epoch": 0.3007956530176596, "grad_norm": 1.518857601222952, "learning_rate": 4.513271464728406e-05, "loss": 0.6184, "step": 3100 }, { "epoch": 0.3017659615757811, "grad_norm": 1.5688964933772704, "learning_rate": 4.5116490362775e-05, "loss": 0.6904, "step": 3110 }, { "epoch": 0.3027362701339026, "grad_norm": 1.346013000056015, "learning_rate": 4.510026607826595e-05, "loss": 0.6591, "step": 3120 }, { "epoch": 0.30370657869202405, "grad_norm": 1.7289926750048026, "learning_rate": 4.5084041793756895e-05, "loss": 0.602, "step": 3130 }, { "epoch": 0.3046768872501455, "grad_norm": 1.634884511910698, "learning_rate": 4.5067817509247845e-05, "loss": 0.6344, "step": 3140 }, { "epoch": 0.30564719580826705, "grad_norm": 1.2072013322253554, "learning_rate": 4.5051593224738795e-05, "loss": 0.6441, "step": 3150 }, { "epoch": 0.3066175043663885, "grad_norm": 1.4877523069726029, "learning_rate": 4.503536894022974e-05, "loss": 0.6711, "step": 3160 }, { "epoch": 0.30758781292451, "grad_norm": 1.3820572558751547, "learning_rate": 4.501914465572069e-05, "loss": 0.7005, "step": 3170 }, { "epoch": 0.30855812148263145, "grad_norm": 1.4545570501775118, "learning_rate": 4.500292037121163e-05, "loss": 0.6796, "step": 3180 }, { "epoch": 0.309528430040753, "grad_norm": 1.4415846087886384, "learning_rate": 4.498669608670258e-05, "loss": 0.6753, "step": 3190 }, { "epoch": 0.31049873859887445, "grad_norm": 1.253477394104618, "learning_rate": 4.497047180219352e-05, "loss": 0.745, "step": 3200 }, { "epoch": 0.3114690471569959, "grad_norm": 1.6938495814472803, "learning_rate": 4.495424751768447e-05, "loss": 0.6656, "step": 3210 }, { "epoch": 0.3124393557151174, "grad_norm": 1.7041386856543572, "learning_rate": 4.4938023233175416e-05, "loss": 0.678, "step": 3220 }, { "epoch": 0.3134096642732389, "grad_norm": 1.945535890749437, "learning_rate": 4.4921798948666366e-05, "loss": 0.6741, "step": 3230 }, { "epoch": 0.3143799728313604, "grad_norm": 1.504845552046309, "learning_rate": 4.4905574664157316e-05, "loss": 0.7055, "step": 3240 }, { "epoch": 0.31535028138948185, "grad_norm": 1.5218102037928898, "learning_rate": 4.488935037964826e-05, "loss": 0.6664, "step": 3250 }, { "epoch": 0.3163205899476033, "grad_norm": 1.281958434229701, "learning_rate": 4.487312609513921e-05, "loss": 0.689, "step": 3260 }, { "epoch": 0.31729089850572484, "grad_norm": 1.4017870385811553, "learning_rate": 4.485690181063015e-05, "loss": 0.656, "step": 3270 }, { "epoch": 0.3182612070638463, "grad_norm": 1.1312164184452325, "learning_rate": 4.48406775261211e-05, "loss": 0.6956, "step": 3280 }, { "epoch": 0.3192315156219678, "grad_norm": 1.5553483087810633, "learning_rate": 4.4824453241612044e-05, "loss": 0.627, "step": 3290 }, { "epoch": 0.32020182418008925, "grad_norm": 1.5414421667820604, "learning_rate": 4.4808228957102994e-05, "loss": 0.6567, "step": 3300 }, { "epoch": 0.32117213273821077, "grad_norm": 1.3882171728699897, "learning_rate": 4.479200467259394e-05, "loss": 0.6478, "step": 3310 }, { "epoch": 0.32214244129633224, "grad_norm": 1.2870563704453035, "learning_rate": 4.477578038808489e-05, "loss": 0.6258, "step": 3320 }, { "epoch": 0.3231127498544537, "grad_norm": 1.2176939882656843, "learning_rate": 4.4759556103575836e-05, "loss": 0.6831, "step": 3330 }, { "epoch": 0.3240830584125752, "grad_norm": 1.5859704946797195, "learning_rate": 4.474333181906678e-05, "loss": 0.6875, "step": 3340 }, { "epoch": 0.3250533669706967, "grad_norm": 1.3048860685179964, "learning_rate": 4.472710753455773e-05, "loss": 0.6797, "step": 3350 }, { "epoch": 0.32602367552881817, "grad_norm": 1.4244729025826581, "learning_rate": 4.471088325004867e-05, "loss": 0.6671, "step": 3360 }, { "epoch": 0.32699398408693964, "grad_norm": 1.664993378309651, "learning_rate": 4.469465896553962e-05, "loss": 0.6946, "step": 3370 }, { "epoch": 0.3279642926450611, "grad_norm": 1.4374586076882605, "learning_rate": 4.4678434681030565e-05, "loss": 0.7152, "step": 3380 }, { "epoch": 0.32893460120318263, "grad_norm": 1.183729655148727, "learning_rate": 4.4662210396521515e-05, "loss": 0.6695, "step": 3390 }, { "epoch": 0.3299049097613041, "grad_norm": 1.3106066103858482, "learning_rate": 4.4645986112012464e-05, "loss": 0.681, "step": 3400 }, { "epoch": 0.33087521831942557, "grad_norm": 1.9834642707734547, "learning_rate": 4.462976182750341e-05, "loss": 0.6544, "step": 3410 }, { "epoch": 0.33184552687754704, "grad_norm": 1.2898353030549436, "learning_rate": 4.461353754299436e-05, "loss": 0.6516, "step": 3420 }, { "epoch": 0.33281583543566856, "grad_norm": 1.3210516235415775, "learning_rate": 4.45973132584853e-05, "loss": 0.6684, "step": 3430 }, { "epoch": 0.33378614399379003, "grad_norm": 1.4442553376708276, "learning_rate": 4.458108897397625e-05, "loss": 0.6673, "step": 3440 }, { "epoch": 0.3347564525519115, "grad_norm": 1.1500209299804536, "learning_rate": 4.456486468946719e-05, "loss": 0.7158, "step": 3450 }, { "epoch": 0.33572676111003297, "grad_norm": 1.31538694061074, "learning_rate": 4.454864040495814e-05, "loss": 0.647, "step": 3460 }, { "epoch": 0.3366970696681545, "grad_norm": 1.4081971729681526, "learning_rate": 4.4532416120449086e-05, "loss": 0.6964, "step": 3470 }, { "epoch": 0.33766737822627596, "grad_norm": 1.4794888005976117, "learning_rate": 4.4516191835940036e-05, "loss": 0.6644, "step": 3480 }, { "epoch": 0.33863768678439743, "grad_norm": 1.3524205715043236, "learning_rate": 4.4499967551430985e-05, "loss": 0.6421, "step": 3490 }, { "epoch": 0.3396079953425189, "grad_norm": 1.5105858350763188, "learning_rate": 4.448374326692193e-05, "loss": 0.7105, "step": 3500 }, { "epoch": 0.3396079953425189, "eval_loss": 0.7246462106704712, "eval_runtime": 2471.0932, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 3500 }, { "epoch": 0.3405783039006404, "grad_norm": 1.4583137688880468, "learning_rate": 4.446751898241288e-05, "loss": 0.6087, "step": 3510 }, { "epoch": 0.3415486124587619, "grad_norm": 1.4130179418528255, "learning_rate": 4.445129469790382e-05, "loss": 0.6628, "step": 3520 }, { "epoch": 0.34251892101688336, "grad_norm": 1.3225389415201874, "learning_rate": 4.443507041339477e-05, "loss": 0.625, "step": 3530 }, { "epoch": 0.34348922957500483, "grad_norm": 1.8587881703474012, "learning_rate": 4.4418846128885714e-05, "loss": 0.6416, "step": 3540 }, { "epoch": 0.34445953813312635, "grad_norm": 1.2742394540775415, "learning_rate": 4.4402621844376664e-05, "loss": 0.632, "step": 3550 }, { "epoch": 0.3454298466912478, "grad_norm": 1.64437983962982, "learning_rate": 4.4386397559867607e-05, "loss": 0.6383, "step": 3560 }, { "epoch": 0.3464001552493693, "grad_norm": 1.3231966805125737, "learning_rate": 4.4370173275358556e-05, "loss": 0.6841, "step": 3570 }, { "epoch": 0.34737046380749076, "grad_norm": 1.167387762307211, "learning_rate": 4.4353948990849506e-05, "loss": 0.616, "step": 3580 }, { "epoch": 0.3483407723656123, "grad_norm": 1.4336867980215322, "learning_rate": 4.433772470634045e-05, "loss": 0.6195, "step": 3590 }, { "epoch": 0.34931108092373375, "grad_norm": 1.403759520909343, "learning_rate": 4.43215004218314e-05, "loss": 0.6787, "step": 3600 }, { "epoch": 0.3502813894818552, "grad_norm": 1.4695150471417817, "learning_rate": 4.430527613732234e-05, "loss": 0.6433, "step": 3610 }, { "epoch": 0.3512516980399767, "grad_norm": 1.3322730792588582, "learning_rate": 4.42890518528133e-05, "loss": 0.6827, "step": 3620 }, { "epoch": 0.3522220065980982, "grad_norm": 1.2541946759332971, "learning_rate": 4.427282756830424e-05, "loss": 0.6767, "step": 3630 }, { "epoch": 0.3531923151562197, "grad_norm": 1.3220529304919946, "learning_rate": 4.425660328379519e-05, "loss": 0.6785, "step": 3640 }, { "epoch": 0.35416262371434115, "grad_norm": 1.8159199916167459, "learning_rate": 4.4240378999286134e-05, "loss": 0.6504, "step": 3650 }, { "epoch": 0.3551329322724626, "grad_norm": 1.5513977697025123, "learning_rate": 4.4224154714777084e-05, "loss": 0.668, "step": 3660 }, { "epoch": 0.35610324083058414, "grad_norm": 1.9383083203795937, "learning_rate": 4.420793043026803e-05, "loss": 0.6314, "step": 3670 }, { "epoch": 0.3570735493887056, "grad_norm": 1.5611559659864904, "learning_rate": 4.419170614575898e-05, "loss": 0.6617, "step": 3680 }, { "epoch": 0.3580438579468271, "grad_norm": 1.2579691148448051, "learning_rate": 4.4175481861249927e-05, "loss": 0.6641, "step": 3690 }, { "epoch": 0.35901416650494855, "grad_norm": 1.8250727183362423, "learning_rate": 4.415925757674087e-05, "loss": 0.628, "step": 3700 }, { "epoch": 0.3599844750630701, "grad_norm": 1.8663848290613063, "learning_rate": 4.414303329223182e-05, "loss": 0.6819, "step": 3710 }, { "epoch": 0.36095478362119154, "grad_norm": 0.998691604867035, "learning_rate": 4.412680900772276e-05, "loss": 0.6603, "step": 3720 }, { "epoch": 0.361925092179313, "grad_norm": 1.5186010030355703, "learning_rate": 4.411058472321371e-05, "loss": 0.6608, "step": 3730 }, { "epoch": 0.3628954007374345, "grad_norm": 1.3444105870233574, "learning_rate": 4.4094360438704655e-05, "loss": 0.6684, "step": 3740 }, { "epoch": 0.363865709295556, "grad_norm": 1.8433609494425311, "learning_rate": 4.4078136154195605e-05, "loss": 0.7007, "step": 3750 }, { "epoch": 0.3648360178536775, "grad_norm": 1.3736456316572478, "learning_rate": 4.406191186968655e-05, "loss": 0.6691, "step": 3760 }, { "epoch": 0.36580632641179894, "grad_norm": 1.3845972262001316, "learning_rate": 4.40456875851775e-05, "loss": 0.699, "step": 3770 }, { "epoch": 0.3667766349699204, "grad_norm": 1.3202439744871353, "learning_rate": 4.402946330066845e-05, "loss": 0.6463, "step": 3780 }, { "epoch": 0.36774694352804194, "grad_norm": 1.527206134833558, "learning_rate": 4.401323901615939e-05, "loss": 0.6261, "step": 3790 }, { "epoch": 0.3687172520861634, "grad_norm": 1.85165256113485, "learning_rate": 4.399701473165034e-05, "loss": 0.6268, "step": 3800 }, { "epoch": 0.3696875606442849, "grad_norm": 1.7774545505998887, "learning_rate": 4.398079044714128e-05, "loss": 0.674, "step": 3810 }, { "epoch": 0.37065786920240634, "grad_norm": 1.1302938960409563, "learning_rate": 4.396456616263223e-05, "loss": 0.6814, "step": 3820 }, { "epoch": 0.37162817776052787, "grad_norm": 1.5633030889510193, "learning_rate": 4.3948341878123176e-05, "loss": 0.6821, "step": 3830 }, { "epoch": 0.37259848631864934, "grad_norm": 1.4226623590722947, "learning_rate": 4.3932117593614126e-05, "loss": 0.6721, "step": 3840 }, { "epoch": 0.3735687948767708, "grad_norm": 1.3788997600836588, "learning_rate": 4.391589330910507e-05, "loss": 0.6484, "step": 3850 }, { "epoch": 0.3745391034348923, "grad_norm": 1.9387617060516118, "learning_rate": 4.389966902459602e-05, "loss": 0.7059, "step": 3860 }, { "epoch": 0.3755094119930138, "grad_norm": 1.4159508230128068, "learning_rate": 4.388344474008697e-05, "loss": 0.6763, "step": 3870 }, { "epoch": 0.37647972055113527, "grad_norm": 1.4395070409428377, "learning_rate": 4.386722045557791e-05, "loss": 0.6441, "step": 3880 }, { "epoch": 0.37745002910925674, "grad_norm": 1.5259821305785304, "learning_rate": 4.385099617106886e-05, "loss": 0.7342, "step": 3890 }, { "epoch": 0.3784203376673782, "grad_norm": 1.58232361711394, "learning_rate": 4.3834771886559804e-05, "loss": 0.7058, "step": 3900 }, { "epoch": 0.37939064622549973, "grad_norm": 1.2758386103028025, "learning_rate": 4.3818547602050754e-05, "loss": 0.6116, "step": 3910 }, { "epoch": 0.3803609547836212, "grad_norm": 1.354723087565567, "learning_rate": 4.38023233175417e-05, "loss": 0.6709, "step": 3920 }, { "epoch": 0.38133126334174267, "grad_norm": 1.5790529961766175, "learning_rate": 4.3786099033032646e-05, "loss": 0.7028, "step": 3930 }, { "epoch": 0.38230157189986413, "grad_norm": 1.9609521386475108, "learning_rate": 4.376987474852359e-05, "loss": 0.5906, "step": 3940 }, { "epoch": 0.38327188045798566, "grad_norm": 1.3349118770454718, "learning_rate": 4.375365046401454e-05, "loss": 0.6623, "step": 3950 }, { "epoch": 0.3842421890161071, "grad_norm": 1.2937167531350466, "learning_rate": 4.373742617950549e-05, "loss": 0.6369, "step": 3960 }, { "epoch": 0.3852124975742286, "grad_norm": 1.3419705920519438, "learning_rate": 4.372120189499643e-05, "loss": 0.7277, "step": 3970 }, { "epoch": 0.38618280613235006, "grad_norm": 1.7073814830995901, "learning_rate": 4.370497761048738e-05, "loss": 0.6241, "step": 3980 }, { "epoch": 0.3871531146904716, "grad_norm": 1.373271354057727, "learning_rate": 4.3688753325978325e-05, "loss": 0.6481, "step": 3990 }, { "epoch": 0.38812342324859306, "grad_norm": 1.7949929197599008, "learning_rate": 4.3672529041469275e-05, "loss": 0.6125, "step": 4000 }, { "epoch": 0.38812342324859306, "eval_loss": 0.7137264609336853, "eval_runtime": 2466.144, "eval_samples_per_second": 0.727, "eval_steps_per_second": 0.363, "step": 4000 }, { "epoch": 0.3890937318067145, "grad_norm": 0.8329579782452979, "learning_rate": 4.365630475696022e-05, "loss": 0.6341, "step": 4010 }, { "epoch": 0.390064040364836, "grad_norm": 1.2617089929475087, "learning_rate": 4.364008047245117e-05, "loss": 0.671, "step": 4020 }, { "epoch": 0.3910343489229575, "grad_norm": 1.5577127482655397, "learning_rate": 4.362385618794212e-05, "loss": 0.6793, "step": 4030 }, { "epoch": 0.392004657481079, "grad_norm": 1.6386238000694935, "learning_rate": 4.360763190343306e-05, "loss": 0.7014, "step": 4040 }, { "epoch": 0.39297496603920046, "grad_norm": 1.5648938337659175, "learning_rate": 4.359140761892401e-05, "loss": 0.6867, "step": 4050 }, { "epoch": 0.3939452745973219, "grad_norm": 1.6294675174253543, "learning_rate": 4.357518333441495e-05, "loss": 0.6706, "step": 4060 }, { "epoch": 0.39491558315544345, "grad_norm": 1.549167727126926, "learning_rate": 4.35589590499059e-05, "loss": 0.6536, "step": 4070 }, { "epoch": 0.3958858917135649, "grad_norm": 1.4182199112027882, "learning_rate": 4.3542734765396846e-05, "loss": 0.679, "step": 4080 }, { "epoch": 0.3968562002716864, "grad_norm": 1.2821446945633657, "learning_rate": 4.3526510480887795e-05, "loss": 0.6643, "step": 4090 }, { "epoch": 0.39782650882980786, "grad_norm": 1.6858376405816184, "learning_rate": 4.351028619637874e-05, "loss": 0.6394, "step": 4100 }, { "epoch": 0.3987968173879294, "grad_norm": 1.4610732414303427, "learning_rate": 4.349406191186969e-05, "loss": 0.7123, "step": 4110 }, { "epoch": 0.39976712594605085, "grad_norm": 1.504769761142886, "learning_rate": 4.347783762736064e-05, "loss": 0.6071, "step": 4120 }, { "epoch": 0.4007374345041723, "grad_norm": 1.6912171907525997, "learning_rate": 4.346161334285158e-05, "loss": 0.5923, "step": 4130 }, { "epoch": 0.4017077430622938, "grad_norm": 1.5783940240743402, "learning_rate": 4.344538905834253e-05, "loss": 0.5902, "step": 4140 }, { "epoch": 0.4026780516204153, "grad_norm": 1.300258602942144, "learning_rate": 4.3429164773833474e-05, "loss": 0.668, "step": 4150 }, { "epoch": 0.4036483601785368, "grad_norm": 1.3879301457682354, "learning_rate": 4.3412940489324423e-05, "loss": 0.6578, "step": 4160 }, { "epoch": 0.40461866873665825, "grad_norm": 1.0418247331643762, "learning_rate": 4.3396716204815366e-05, "loss": 0.6539, "step": 4170 }, { "epoch": 0.4055889772947797, "grad_norm": 1.817053798243336, "learning_rate": 4.3380491920306316e-05, "loss": 0.6296, "step": 4180 }, { "epoch": 0.40655928585290124, "grad_norm": 1.8784845030864437, "learning_rate": 4.336426763579726e-05, "loss": 0.6435, "step": 4190 }, { "epoch": 0.4075295944110227, "grad_norm": 1.3173177735824406, "learning_rate": 4.334804335128821e-05, "loss": 0.6467, "step": 4200 }, { "epoch": 0.4084999029691442, "grad_norm": 1.6609920860725593, "learning_rate": 4.333181906677916e-05, "loss": 0.6506, "step": 4210 }, { "epoch": 0.40947021152726565, "grad_norm": 1.3422418231507067, "learning_rate": 4.33155947822701e-05, "loss": 0.6794, "step": 4220 }, { "epoch": 0.4104405200853872, "grad_norm": 1.6530218294736676, "learning_rate": 4.329937049776105e-05, "loss": 0.6458, "step": 4230 }, { "epoch": 0.41141082864350864, "grad_norm": 1.7371351925998049, "learning_rate": 4.3283146213251995e-05, "loss": 0.6085, "step": 4240 }, { "epoch": 0.4123811372016301, "grad_norm": 1.1986220099049816, "learning_rate": 4.3266921928742944e-05, "loss": 0.6763, "step": 4250 }, { "epoch": 0.4133514457597516, "grad_norm": 1.4910668696454408, "learning_rate": 4.325069764423389e-05, "loss": 0.6943, "step": 4260 }, { "epoch": 0.4143217543178731, "grad_norm": 1.605222234480949, "learning_rate": 4.323447335972484e-05, "loss": 0.5683, "step": 4270 }, { "epoch": 0.41529206287599457, "grad_norm": 1.6517210814862113, "learning_rate": 4.321824907521578e-05, "loss": 0.6121, "step": 4280 }, { "epoch": 0.41626237143411604, "grad_norm": 1.5780093620130797, "learning_rate": 4.320202479070673e-05, "loss": 0.634, "step": 4290 }, { "epoch": 0.4172326799922375, "grad_norm": 1.4948523984666717, "learning_rate": 4.318580050619768e-05, "loss": 0.6736, "step": 4300 }, { "epoch": 0.41820298855035903, "grad_norm": 1.412186726690487, "learning_rate": 4.316957622168862e-05, "loss": 0.6645, "step": 4310 }, { "epoch": 0.4191732971084805, "grad_norm": 1.3014470286002153, "learning_rate": 4.315335193717957e-05, "loss": 0.611, "step": 4320 }, { "epoch": 0.42014360566660197, "grad_norm": 1.5892208566989257, "learning_rate": 4.3137127652670515e-05, "loss": 0.6545, "step": 4330 }, { "epoch": 0.42111391422472344, "grad_norm": 1.912773548887141, "learning_rate": 4.3120903368161465e-05, "loss": 0.6454, "step": 4340 }, { "epoch": 0.42208422278284496, "grad_norm": 1.7286922382345113, "learning_rate": 4.310467908365241e-05, "loss": 0.6591, "step": 4350 }, { "epoch": 0.42305453134096643, "grad_norm": 1.8839412216123284, "learning_rate": 4.308845479914336e-05, "loss": 0.5875, "step": 4360 }, { "epoch": 0.4240248398990879, "grad_norm": 1.433166987175659, "learning_rate": 4.30722305146343e-05, "loss": 0.6405, "step": 4370 }, { "epoch": 0.42499514845720937, "grad_norm": 1.493313820047122, "learning_rate": 4.305600623012525e-05, "loss": 0.6421, "step": 4380 }, { "epoch": 0.4259654570153309, "grad_norm": 1.470156638554737, "learning_rate": 4.30397819456162e-05, "loss": 0.6212, "step": 4390 }, { "epoch": 0.42693576557345236, "grad_norm": 1.801577389486602, "learning_rate": 4.3023557661107143e-05, "loss": 0.589, "step": 4400 }, { "epoch": 0.42790607413157383, "grad_norm": 1.6174671178388904, "learning_rate": 4.30073333765981e-05, "loss": 0.6612, "step": 4410 }, { "epoch": 0.4288763826896953, "grad_norm": 1.5169174455759806, "learning_rate": 4.299110909208904e-05, "loss": 0.6418, "step": 4420 }, { "epoch": 0.4298466912478168, "grad_norm": 1.7159303308076814, "learning_rate": 4.297488480757999e-05, "loss": 0.6363, "step": 4430 }, { "epoch": 0.4308169998059383, "grad_norm": 1.9158386115366701, "learning_rate": 4.2958660523070936e-05, "loss": 0.6639, "step": 4440 }, { "epoch": 0.43178730836405976, "grad_norm": 1.6614034382324376, "learning_rate": 4.2942436238561885e-05, "loss": 0.6546, "step": 4450 }, { "epoch": 0.43275761692218123, "grad_norm": 1.528595189219001, "learning_rate": 4.292621195405283e-05, "loss": 0.6151, "step": 4460 }, { "epoch": 0.43372792548030276, "grad_norm": 1.297393909600355, "learning_rate": 4.290998766954378e-05, "loss": 0.6142, "step": 4470 }, { "epoch": 0.4346982340384242, "grad_norm": 1.6025277242190177, "learning_rate": 4.289376338503472e-05, "loss": 0.6415, "step": 4480 }, { "epoch": 0.4356685425965457, "grad_norm": 1.550877285078, "learning_rate": 4.287753910052567e-05, "loss": 0.6896, "step": 4490 }, { "epoch": 0.43663885115466716, "grad_norm": 1.8134887382719538, "learning_rate": 4.286131481601662e-05, "loss": 0.674, "step": 4500 }, { "epoch": 0.43663885115466716, "eval_loss": 0.70569908618927, "eval_runtime": 2470.7726, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 4500 }, { "epoch": 0.4376091597127887, "grad_norm": 1.5715320462855416, "learning_rate": 4.2845090531507564e-05, "loss": 0.6107, "step": 4510 }, { "epoch": 0.43857946827091016, "grad_norm": 1.659219569116577, "learning_rate": 4.2828866246998514e-05, "loss": 0.554, "step": 4520 }, { "epoch": 0.4395497768290316, "grad_norm": 1.5014148593583845, "learning_rate": 4.2812641962489457e-05, "loss": 0.6541, "step": 4530 }, { "epoch": 0.4405200853871531, "grad_norm": 1.2080187388338057, "learning_rate": 4.2796417677980406e-05, "loss": 0.6704, "step": 4540 }, { "epoch": 0.4414903939452746, "grad_norm": 1.4344070923299042, "learning_rate": 4.278019339347135e-05, "loss": 0.637, "step": 4550 }, { "epoch": 0.4424607025033961, "grad_norm": 1.9816113300875973, "learning_rate": 4.27639691089623e-05, "loss": 0.6036, "step": 4560 }, { "epoch": 0.44343101106151755, "grad_norm": 1.270379951275499, "learning_rate": 4.274774482445324e-05, "loss": 0.6481, "step": 4570 }, { "epoch": 0.444401319619639, "grad_norm": 1.559280795643455, "learning_rate": 4.273152053994419e-05, "loss": 0.62, "step": 4580 }, { "epoch": 0.44537162817776055, "grad_norm": 1.8377481053949813, "learning_rate": 4.271529625543514e-05, "loss": 0.6662, "step": 4590 }, { "epoch": 0.446341936735882, "grad_norm": 1.3952819132659193, "learning_rate": 4.2699071970926085e-05, "loss": 0.6969, "step": 4600 }, { "epoch": 0.4473122452940035, "grad_norm": 1.5967451654603113, "learning_rate": 4.2682847686417034e-05, "loss": 0.6604, "step": 4610 }, { "epoch": 0.44828255385212495, "grad_norm": 1.5849329398639342, "learning_rate": 4.266662340190798e-05, "loss": 0.6589, "step": 4620 }, { "epoch": 0.4492528624102465, "grad_norm": 1.7238645892594557, "learning_rate": 4.265039911739893e-05, "loss": 0.6658, "step": 4630 }, { "epoch": 0.45022317096836795, "grad_norm": 1.7849098949361206, "learning_rate": 4.263417483288987e-05, "loss": 0.6507, "step": 4640 }, { "epoch": 0.4511934795264894, "grad_norm": 1.3539873989398281, "learning_rate": 4.261795054838082e-05, "loss": 0.631, "step": 4650 }, { "epoch": 0.4521637880846109, "grad_norm": 1.6177606598072791, "learning_rate": 4.260172626387176e-05, "loss": 0.7667, "step": 4660 }, { "epoch": 0.4531340966427324, "grad_norm": 1.7683798129102917, "learning_rate": 4.258550197936271e-05, "loss": 0.6182, "step": 4670 }, { "epoch": 0.4541044052008539, "grad_norm": 1.5536790285936453, "learning_rate": 4.256927769485366e-05, "loss": 0.6515, "step": 4680 }, { "epoch": 0.45507471375897535, "grad_norm": 1.4626963492189242, "learning_rate": 4.2553053410344605e-05, "loss": 0.5595, "step": 4690 }, { "epoch": 0.4560450223170968, "grad_norm": 1.8455989589758681, "learning_rate": 4.2536829125835555e-05, "loss": 0.6382, "step": 4700 }, { "epoch": 0.45701533087521834, "grad_norm": 1.8260482347716946, "learning_rate": 4.25206048413265e-05, "loss": 0.6708, "step": 4710 }, { "epoch": 0.4579856394333398, "grad_norm": 1.506588655076192, "learning_rate": 4.250438055681745e-05, "loss": 0.6138, "step": 4720 }, { "epoch": 0.4589559479914613, "grad_norm": 1.7415606928937182, "learning_rate": 4.248815627230839e-05, "loss": 0.6688, "step": 4730 }, { "epoch": 0.45992625654958275, "grad_norm": 1.2860177177533143, "learning_rate": 4.247193198779934e-05, "loss": 0.6496, "step": 4740 }, { "epoch": 0.46089656510770427, "grad_norm": 1.419953340190783, "learning_rate": 4.245570770329029e-05, "loss": 0.6399, "step": 4750 }, { "epoch": 0.46186687366582574, "grad_norm": 1.4197804283366926, "learning_rate": 4.2439483418781234e-05, "loss": 0.572, "step": 4760 }, { "epoch": 0.4628371822239472, "grad_norm": 1.1761100117238914, "learning_rate": 4.242325913427218e-05, "loss": 0.6099, "step": 4770 }, { "epoch": 0.4638074907820687, "grad_norm": 1.5767612487613212, "learning_rate": 4.2407034849763126e-05, "loss": 0.6223, "step": 4780 }, { "epoch": 0.4647777993401902, "grad_norm": 1.8050008733247063, "learning_rate": 4.2390810565254076e-05, "loss": 0.5695, "step": 4790 }, { "epoch": 0.46574810789831167, "grad_norm": 1.3088703288484584, "learning_rate": 4.237458628074502e-05, "loss": 0.5723, "step": 4800 }, { "epoch": 0.46671841645643314, "grad_norm": 1.7711140961973422, "learning_rate": 4.235836199623597e-05, "loss": 0.6173, "step": 4810 }, { "epoch": 0.4676887250145546, "grad_norm": 1.6431929005817145, "learning_rate": 4.234213771172691e-05, "loss": 0.5982, "step": 4820 }, { "epoch": 0.46865903357267613, "grad_norm": 1.557431489902951, "learning_rate": 4.232591342721786e-05, "loss": 0.6098, "step": 4830 }, { "epoch": 0.4696293421307976, "grad_norm": 1.4479656995240782, "learning_rate": 4.230968914270881e-05, "loss": 0.5699, "step": 4840 }, { "epoch": 0.47059965068891907, "grad_norm": 1.6046344258439647, "learning_rate": 4.2293464858199754e-05, "loss": 0.6243, "step": 4850 }, { "epoch": 0.47156995924704054, "grad_norm": 1.252218532607539, "learning_rate": 4.2277240573690704e-05, "loss": 0.5787, "step": 4860 }, { "epoch": 0.47254026780516206, "grad_norm": 1.4372595855894126, "learning_rate": 4.226101628918165e-05, "loss": 0.6483, "step": 4870 }, { "epoch": 0.47351057636328353, "grad_norm": 1.6112811676393963, "learning_rate": 4.22447920046726e-05, "loss": 0.6042, "step": 4880 }, { "epoch": 0.474480884921405, "grad_norm": 1.6387669351547591, "learning_rate": 4.222856772016354e-05, "loss": 0.6176, "step": 4890 }, { "epoch": 0.47545119347952647, "grad_norm": 1.4893296879883764, "learning_rate": 4.221234343565449e-05, "loss": 0.5976, "step": 4900 }, { "epoch": 0.476421502037648, "grad_norm": 1.663980815080282, "learning_rate": 4.219611915114543e-05, "loss": 0.6655, "step": 4910 }, { "epoch": 0.47739181059576946, "grad_norm": 1.3387695933594599, "learning_rate": 4.217989486663638e-05, "loss": 0.6166, "step": 4920 }, { "epoch": 0.47836211915389093, "grad_norm": 1.7319521884893254, "learning_rate": 4.216367058212733e-05, "loss": 0.6225, "step": 4930 }, { "epoch": 0.4793324277120124, "grad_norm": 1.4563922038982844, "learning_rate": 4.2147446297618275e-05, "loss": 0.6351, "step": 4940 }, { "epoch": 0.4803027362701339, "grad_norm": 1.7473358339447822, "learning_rate": 4.2131222013109225e-05, "loss": 0.6205, "step": 4950 }, { "epoch": 0.4812730448282554, "grad_norm": 1.4006136660964967, "learning_rate": 4.211499772860017e-05, "loss": 0.6992, "step": 4960 }, { "epoch": 0.48224335338637686, "grad_norm": 1.7446969484471915, "learning_rate": 4.209877344409112e-05, "loss": 0.6251, "step": 4970 }, { "epoch": 0.48321366194449833, "grad_norm": 1.3789648220512414, "learning_rate": 4.208254915958206e-05, "loss": 0.5856, "step": 4980 }, { "epoch": 0.48418397050261985, "grad_norm": 1.77919007957107, "learning_rate": 4.206632487507301e-05, "loss": 0.621, "step": 4990 }, { "epoch": 0.4851542790607413, "grad_norm": 1.5926084629766706, "learning_rate": 4.2050100590563953e-05, "loss": 0.6542, "step": 5000 }, { "epoch": 0.4851542790607413, "eval_loss": 0.6975318789482117, "eval_runtime": 2469.0533, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 5000 }, { "epoch": 0.4861245876188628, "grad_norm": 1.7733999980186248, "learning_rate": 4.20338763060549e-05, "loss": 0.6943, "step": 5010 }, { "epoch": 0.48709489617698426, "grad_norm": 1.7221032223753934, "learning_rate": 4.201765202154585e-05, "loss": 0.6338, "step": 5020 }, { "epoch": 0.4880652047351058, "grad_norm": 1.7180735514930228, "learning_rate": 4.2001427737036796e-05, "loss": 0.6698, "step": 5030 }, { "epoch": 0.48903551329322725, "grad_norm": 1.698242629280347, "learning_rate": 4.1985203452527746e-05, "loss": 0.6972, "step": 5040 }, { "epoch": 0.4900058218513487, "grad_norm": 1.8269714939747912, "learning_rate": 4.196897916801869e-05, "loss": 0.6212, "step": 5050 }, { "epoch": 0.4909761304094702, "grad_norm": 1.4693827904505679, "learning_rate": 4.195275488350964e-05, "loss": 0.5933, "step": 5060 }, { "epoch": 0.4919464389675917, "grad_norm": 1.49361517085645, "learning_rate": 4.193653059900058e-05, "loss": 0.5489, "step": 5070 }, { "epoch": 0.4929167475257132, "grad_norm": 1.5630723710374232, "learning_rate": 4.192030631449153e-05, "loss": 0.6578, "step": 5080 }, { "epoch": 0.49388705608383465, "grad_norm": 1.6495843346442778, "learning_rate": 4.1904082029982474e-05, "loss": 0.6906, "step": 5090 }, { "epoch": 0.4948573646419561, "grad_norm": 1.7983678208711196, "learning_rate": 4.1887857745473424e-05, "loss": 0.5486, "step": 5100 }, { "epoch": 0.49582767320007765, "grad_norm": 2.00722752034129, "learning_rate": 4.1871633460964374e-05, "loss": 0.648, "step": 5110 }, { "epoch": 0.4967979817581991, "grad_norm": 1.7918897206429327, "learning_rate": 4.185540917645532e-05, "loss": 0.6693, "step": 5120 }, { "epoch": 0.4977682903163206, "grad_norm": 1.3410097713324785, "learning_rate": 4.1839184891946267e-05, "loss": 0.6065, "step": 5130 }, { "epoch": 0.49873859887444205, "grad_norm": 1.8485275360772722, "learning_rate": 4.182296060743721e-05, "loss": 0.6802, "step": 5140 }, { "epoch": 0.4997089074325636, "grad_norm": 1.923253820990962, "learning_rate": 4.180673632292816e-05, "loss": 0.6014, "step": 5150 }, { "epoch": 0.500679215990685, "grad_norm": 1.325223633752785, "learning_rate": 4.17905120384191e-05, "loss": 0.5787, "step": 5160 }, { "epoch": 0.5016495245488065, "grad_norm": 1.7937826702440574, "learning_rate": 4.177428775391005e-05, "loss": 0.6197, "step": 5170 }, { "epoch": 0.502619833106928, "grad_norm": 1.4050667202069218, "learning_rate": 4.1758063469401e-05, "loss": 0.6, "step": 5180 }, { "epoch": 0.5035901416650495, "grad_norm": 1.5656335055581363, "learning_rate": 4.174183918489195e-05, "loss": 0.6924, "step": 5190 }, { "epoch": 0.504560450223171, "grad_norm": 1.3325372640233277, "learning_rate": 4.1725614900382895e-05, "loss": 0.6477, "step": 5200 }, { "epoch": 0.5055307587812925, "grad_norm": 1.5612035852842387, "learning_rate": 4.1709390615873844e-05, "loss": 0.6372, "step": 5210 }, { "epoch": 0.5065010673394139, "grad_norm": 1.6164770277307943, "learning_rate": 4.1693166331364794e-05, "loss": 0.5822, "step": 5220 }, { "epoch": 0.5074713758975354, "grad_norm": 1.8957196740567803, "learning_rate": 4.167694204685574e-05, "loss": 0.5695, "step": 5230 }, { "epoch": 0.5084416844556569, "grad_norm": 1.505057761308431, "learning_rate": 4.166071776234669e-05, "loss": 0.5893, "step": 5240 }, { "epoch": 0.5094119930137784, "grad_norm": 1.2392435413477463, "learning_rate": 4.164449347783763e-05, "loss": 0.6641, "step": 5250 }, { "epoch": 0.5103823015718999, "grad_norm": 1.8002682332596205, "learning_rate": 4.162826919332858e-05, "loss": 0.5892, "step": 5260 }, { "epoch": 0.5113526101300213, "grad_norm": 1.5161558066332184, "learning_rate": 4.161204490881952e-05, "loss": 0.5985, "step": 5270 }, { "epoch": 0.5123229186881428, "grad_norm": 1.3773663290971068, "learning_rate": 4.159582062431047e-05, "loss": 0.6244, "step": 5280 }, { "epoch": 0.5132932272462644, "grad_norm": 1.8649060352228473, "learning_rate": 4.1579596339801415e-05, "loss": 0.6144, "step": 5290 }, { "epoch": 0.5142635358043858, "grad_norm": 1.711468575333758, "learning_rate": 4.1563372055292365e-05, "loss": 0.5889, "step": 5300 }, { "epoch": 0.5152338443625073, "grad_norm": 1.4531536302097425, "learning_rate": 4.1547147770783315e-05, "loss": 0.6063, "step": 5310 }, { "epoch": 0.5162041529206287, "grad_norm": 1.6628035885794548, "learning_rate": 4.153092348627426e-05, "loss": 0.5872, "step": 5320 }, { "epoch": 0.5171744614787502, "grad_norm": 1.43386966167638, "learning_rate": 4.151469920176521e-05, "loss": 0.5836, "step": 5330 }, { "epoch": 0.5181447700368718, "grad_norm": 1.6338584720484304, "learning_rate": 4.149847491725615e-05, "loss": 0.583, "step": 5340 }, { "epoch": 0.5191150785949932, "grad_norm": 1.602663089927568, "learning_rate": 4.14822506327471e-05, "loss": 0.6466, "step": 5350 }, { "epoch": 0.5200853871531147, "grad_norm": 0.993656188560276, "learning_rate": 4.1466026348238044e-05, "loss": 0.5972, "step": 5360 }, { "epoch": 0.5210556957112362, "grad_norm": 1.5353879720655148, "learning_rate": 4.144980206372899e-05, "loss": 0.6159, "step": 5370 }, { "epoch": 0.5220260042693576, "grad_norm": 1.4710027502404226, "learning_rate": 4.143357777921994e-05, "loss": 0.6231, "step": 5380 }, { "epoch": 0.5229963128274792, "grad_norm": 1.3154974116618938, "learning_rate": 4.1417353494710886e-05, "loss": 0.6414, "step": 5390 }, { "epoch": 0.5239666213856006, "grad_norm": 1.5917256061619933, "learning_rate": 4.1401129210201836e-05, "loss": 0.5574, "step": 5400 }, { "epoch": 0.5249369299437221, "grad_norm": 1.999701613939348, "learning_rate": 4.138490492569278e-05, "loss": 0.6659, "step": 5410 }, { "epoch": 0.5259072385018436, "grad_norm": 1.158059375940914, "learning_rate": 4.136868064118373e-05, "loss": 0.5702, "step": 5420 }, { "epoch": 0.526877547059965, "grad_norm": 1.6503529559993917, "learning_rate": 4.135245635667467e-05, "loss": 0.5866, "step": 5430 }, { "epoch": 0.5278478556180866, "grad_norm": 1.778595452682844, "learning_rate": 4.133623207216562e-05, "loss": 0.6727, "step": 5440 }, { "epoch": 0.5288181641762081, "grad_norm": 2.153378806689067, "learning_rate": 4.1320007787656564e-05, "loss": 0.5849, "step": 5450 }, { "epoch": 0.5297884727343295, "grad_norm": 1.5480145123110607, "learning_rate": 4.1303783503147514e-05, "loss": 0.5985, "step": 5460 }, { "epoch": 0.530758781292451, "grad_norm": 1.3074515838089584, "learning_rate": 4.1287559218638464e-05, "loss": 0.6567, "step": 5470 }, { "epoch": 0.5317290898505724, "grad_norm": 1.2634236320868193, "learning_rate": 4.127133493412941e-05, "loss": 0.6744, "step": 5480 }, { "epoch": 0.532699398408694, "grad_norm": 1.4158920202942755, "learning_rate": 4.125511064962036e-05, "loss": 0.6089, "step": 5490 }, { "epoch": 0.5336697069668155, "grad_norm": 1.530835103672291, "learning_rate": 4.12388863651113e-05, "loss": 0.6637, "step": 5500 }, { "epoch": 0.5336697069668155, "eval_loss": 0.6922717094421387, "eval_runtime": 2471.913, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 5500 }, { "epoch": 0.5346400155249369, "grad_norm": 1.0979844633621818, "learning_rate": 4.122266208060225e-05, "loss": 0.6711, "step": 5510 }, { "epoch": 0.5356103240830584, "grad_norm": 1.9531703151807043, "learning_rate": 4.120643779609319e-05, "loss": 0.634, "step": 5520 }, { "epoch": 0.5365806326411799, "grad_norm": 1.4345990864009392, "learning_rate": 4.119021351158414e-05, "loss": 0.6289, "step": 5530 }, { "epoch": 0.5375509411993014, "grad_norm": 1.5002874747132873, "learning_rate": 4.1173989227075085e-05, "loss": 0.6655, "step": 5540 }, { "epoch": 0.5385212497574229, "grad_norm": 1.4114133378596565, "learning_rate": 4.1157764942566035e-05, "loss": 0.6314, "step": 5550 }, { "epoch": 0.5394915583155443, "grad_norm": 1.2792208401822267, "learning_rate": 4.1141540658056985e-05, "loss": 0.5622, "step": 5560 }, { "epoch": 0.5404618668736658, "grad_norm": 1.4436849148969537, "learning_rate": 4.112531637354793e-05, "loss": 0.6758, "step": 5570 }, { "epoch": 0.5414321754317873, "grad_norm": 1.495469226735889, "learning_rate": 4.110909208903888e-05, "loss": 0.6457, "step": 5580 }, { "epoch": 0.5424024839899088, "grad_norm": 1.5696729181281173, "learning_rate": 4.109286780452982e-05, "loss": 0.5434, "step": 5590 }, { "epoch": 0.5433727925480303, "grad_norm": 1.4884099597371836, "learning_rate": 4.107664352002077e-05, "loss": 0.6602, "step": 5600 }, { "epoch": 0.5443431011061518, "grad_norm": 1.364422329620206, "learning_rate": 4.106041923551171e-05, "loss": 0.6093, "step": 5610 }, { "epoch": 0.5453134096642732, "grad_norm": 1.710525492450016, "learning_rate": 4.104419495100266e-05, "loss": 0.5733, "step": 5620 }, { "epoch": 0.5462837182223947, "grad_norm": 1.8645615729328109, "learning_rate": 4.1027970666493606e-05, "loss": 0.6301, "step": 5630 }, { "epoch": 0.5472540267805162, "grad_norm": 2.1212649730065403, "learning_rate": 4.1011746381984556e-05, "loss": 0.6489, "step": 5640 }, { "epoch": 0.5482243353386377, "grad_norm": 1.3978835293983003, "learning_rate": 4.0995522097475506e-05, "loss": 0.6651, "step": 5650 }, { "epoch": 0.5491946438967592, "grad_norm": 1.968855731350402, "learning_rate": 4.097929781296645e-05, "loss": 0.5849, "step": 5660 }, { "epoch": 0.5501649524548806, "grad_norm": 1.3003479487793523, "learning_rate": 4.09630735284574e-05, "loss": 0.5981, "step": 5670 }, { "epoch": 0.5511352610130021, "grad_norm": 1.3093675797291962, "learning_rate": 4.094684924394834e-05, "loss": 0.6001, "step": 5680 }, { "epoch": 0.5521055695711237, "grad_norm": 1.386619668616355, "learning_rate": 4.093062495943929e-05, "loss": 0.587, "step": 5690 }, { "epoch": 0.5530758781292451, "grad_norm": 2.1925053494451108, "learning_rate": 4.0914400674930234e-05, "loss": 0.6748, "step": 5700 }, { "epoch": 0.5540461866873666, "grad_norm": 1.401533269192193, "learning_rate": 4.0898176390421184e-05, "loss": 0.5987, "step": 5710 }, { "epoch": 0.555016495245488, "grad_norm": 1.8417454170154262, "learning_rate": 4.088195210591213e-05, "loss": 0.6169, "step": 5720 }, { "epoch": 0.5559868038036095, "grad_norm": 1.70211748961807, "learning_rate": 4.086572782140308e-05, "loss": 0.6231, "step": 5730 }, { "epoch": 0.5569571123617311, "grad_norm": 1.449836249162655, "learning_rate": 4.0849503536894026e-05, "loss": 0.6636, "step": 5740 }, { "epoch": 0.5579274209198525, "grad_norm": 1.411704552119965, "learning_rate": 4.083327925238497e-05, "loss": 0.6334, "step": 5750 }, { "epoch": 0.558897729477974, "grad_norm": 1.3373247464479125, "learning_rate": 4.081705496787592e-05, "loss": 0.6127, "step": 5760 }, { "epoch": 0.5598680380360955, "grad_norm": 1.6669305203635734, "learning_rate": 4.080083068336686e-05, "loss": 0.6419, "step": 5770 }, { "epoch": 0.5608383465942169, "grad_norm": 1.3599639094085172, "learning_rate": 4.078460639885781e-05, "loss": 0.6838, "step": 5780 }, { "epoch": 0.5618086551523385, "grad_norm": 1.5754322431175416, "learning_rate": 4.0768382114348755e-05, "loss": 0.5997, "step": 5790 }, { "epoch": 0.5627789637104599, "grad_norm": 1.667207521590911, "learning_rate": 4.0752157829839705e-05, "loss": 0.6199, "step": 5800 }, { "epoch": 0.5637492722685814, "grad_norm": 1.5869372089733027, "learning_rate": 4.0735933545330654e-05, "loss": 0.5779, "step": 5810 }, { "epoch": 0.5647195808267029, "grad_norm": 1.7150097071735784, "learning_rate": 4.07197092608216e-05, "loss": 0.5978, "step": 5820 }, { "epoch": 0.5656898893848243, "grad_norm": 1.1835353092728575, "learning_rate": 4.070348497631255e-05, "loss": 0.6781, "step": 5830 }, { "epoch": 0.5666601979429459, "grad_norm": 1.7307613010314937, "learning_rate": 4.068726069180349e-05, "loss": 0.634, "step": 5840 }, { "epoch": 0.5676305065010674, "grad_norm": 1.6364191249486493, "learning_rate": 4.067103640729444e-05, "loss": 0.6377, "step": 5850 }, { "epoch": 0.5686008150591888, "grad_norm": 1.4754833764988036, "learning_rate": 4.065481212278538e-05, "loss": 0.6148, "step": 5860 }, { "epoch": 0.5695711236173103, "grad_norm": 1.8389583610281375, "learning_rate": 4.063858783827633e-05, "loss": 0.6739, "step": 5870 }, { "epoch": 0.5705414321754317, "grad_norm": 1.4669099461048227, "learning_rate": 4.0622363553767276e-05, "loss": 0.598, "step": 5880 }, { "epoch": 0.5715117407335533, "grad_norm": 1.609973267529918, "learning_rate": 4.0606139269258226e-05, "loss": 0.5476, "step": 5890 }, { "epoch": 0.5724820492916748, "grad_norm": 1.6951117185016165, "learning_rate": 4.0589914984749175e-05, "loss": 0.595, "step": 5900 }, { "epoch": 0.5734523578497962, "grad_norm": 1.494158253886906, "learning_rate": 4.057369070024012e-05, "loss": 0.6886, "step": 5910 }, { "epoch": 0.5744226664079177, "grad_norm": 1.3036559330975726, "learning_rate": 4.055746641573107e-05, "loss": 0.5986, "step": 5920 }, { "epoch": 0.5753929749660393, "grad_norm": 1.4737709906961922, "learning_rate": 4.054124213122201e-05, "loss": 0.615, "step": 5930 }, { "epoch": 0.5763632835241607, "grad_norm": 1.4022175413897375, "learning_rate": 4.052501784671296e-05, "loss": 0.6398, "step": 5940 }, { "epoch": 0.5773335920822822, "grad_norm": 1.433959705605443, "learning_rate": 4.0508793562203904e-05, "loss": 0.6415, "step": 5950 }, { "epoch": 0.5783039006404036, "grad_norm": 1.7123129741732308, "learning_rate": 4.0492569277694854e-05, "loss": 0.6216, "step": 5960 }, { "epoch": 0.5792742091985251, "grad_norm": 2.131052124619936, "learning_rate": 4.04763449931858e-05, "loss": 0.5596, "step": 5970 }, { "epoch": 0.5802445177566466, "grad_norm": 1.2233223645362516, "learning_rate": 4.046012070867675e-05, "loss": 0.6023, "step": 5980 }, { "epoch": 0.5812148263147681, "grad_norm": 1.734811390420522, "learning_rate": 4.0443896424167696e-05, "loss": 0.6347, "step": 5990 }, { "epoch": 0.5821851348728896, "grad_norm": 1.626024609025115, "learning_rate": 4.0427672139658646e-05, "loss": 0.5688, "step": 6000 }, { "epoch": 0.5821851348728896, "eval_loss": 0.6903010606765747, "eval_runtime": 2473.5711, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 6000 }, { "epoch": 0.5831554434310111, "grad_norm": 1.3892850877811105, "learning_rate": 4.0411447855149596e-05, "loss": 0.6131, "step": 6010 }, { "epoch": 0.5841257519891325, "grad_norm": 1.6290044308163973, "learning_rate": 4.039522357064054e-05, "loss": 0.6319, "step": 6020 }, { "epoch": 0.585096060547254, "grad_norm": 1.7770012777694764, "learning_rate": 4.037899928613149e-05, "loss": 0.5756, "step": 6030 }, { "epoch": 0.5860663691053755, "grad_norm": 0.9954028008191703, "learning_rate": 4.036277500162243e-05, "loss": 0.6161, "step": 6040 }, { "epoch": 0.587036677663497, "grad_norm": 1.7893541919562175, "learning_rate": 4.034655071711338e-05, "loss": 0.62, "step": 6050 }, { "epoch": 0.5880069862216185, "grad_norm": 1.6500512739247042, "learning_rate": 4.0330326432604324e-05, "loss": 0.5928, "step": 6060 }, { "epoch": 0.5889772947797399, "grad_norm": 1.6568317215206447, "learning_rate": 4.0314102148095274e-05, "loss": 0.6043, "step": 6070 }, { "epoch": 0.5899476033378614, "grad_norm": 2.006336465212855, "learning_rate": 4.029787786358622e-05, "loss": 0.6283, "step": 6080 }, { "epoch": 0.590917911895983, "grad_norm": 1.481658511307882, "learning_rate": 4.028165357907717e-05, "loss": 0.6229, "step": 6090 }, { "epoch": 0.5918882204541044, "grad_norm": 1.7467752898199094, "learning_rate": 4.0265429294568117e-05, "loss": 0.5997, "step": 6100 }, { "epoch": 0.5928585290122259, "grad_norm": 1.747997366501937, "learning_rate": 4.024920501005906e-05, "loss": 0.6441, "step": 6110 }, { "epoch": 0.5938288375703473, "grad_norm": 1.6963969244893895, "learning_rate": 4.023298072555001e-05, "loss": 0.6059, "step": 6120 }, { "epoch": 0.5947991461284688, "grad_norm": 1.335184560881826, "learning_rate": 4.021675644104095e-05, "loss": 0.5627, "step": 6130 }, { "epoch": 0.5957694546865904, "grad_norm": 1.5256295408049876, "learning_rate": 4.02005321565319e-05, "loss": 0.578, "step": 6140 }, { "epoch": 0.5967397632447118, "grad_norm": 1.7615333752357474, "learning_rate": 4.0184307872022845e-05, "loss": 0.6046, "step": 6150 }, { "epoch": 0.5977100718028333, "grad_norm": 1.9321393707932597, "learning_rate": 4.0168083587513795e-05, "loss": 0.615, "step": 6160 }, { "epoch": 0.5986803803609548, "grad_norm": 1.2267052175289888, "learning_rate": 4.015185930300474e-05, "loss": 0.6238, "step": 6170 }, { "epoch": 0.5996506889190762, "grad_norm": 1.8092624768121868, "learning_rate": 4.013563501849569e-05, "loss": 0.6514, "step": 6180 }, { "epoch": 0.6006209974771978, "grad_norm": 1.546108807816704, "learning_rate": 4.011941073398664e-05, "loss": 0.6071, "step": 6190 }, { "epoch": 0.6015913060353192, "grad_norm": 1.8186789877657976, "learning_rate": 4.010318644947758e-05, "loss": 0.6624, "step": 6200 }, { "epoch": 0.6025616145934407, "grad_norm": 1.6349446430924426, "learning_rate": 4.008696216496853e-05, "loss": 0.6208, "step": 6210 }, { "epoch": 0.6035319231515622, "grad_norm": 1.6574782074117176, "learning_rate": 4.007073788045947e-05, "loss": 0.6067, "step": 6220 }, { "epoch": 0.6045022317096836, "grad_norm": 1.372441926371392, "learning_rate": 4.005451359595042e-05, "loss": 0.5805, "step": 6230 }, { "epoch": 0.6054725402678052, "grad_norm": 2.1366961584133164, "learning_rate": 4.0038289311441366e-05, "loss": 0.6797, "step": 6240 }, { "epoch": 0.6064428488259267, "grad_norm": 1.8553936516223448, "learning_rate": 4.0022065026932316e-05, "loss": 0.582, "step": 6250 }, { "epoch": 0.6074131573840481, "grad_norm": 1.95303083914795, "learning_rate": 4.000584074242326e-05, "loss": 0.6474, "step": 6260 }, { "epoch": 0.6083834659421696, "grad_norm": 1.6884162400220937, "learning_rate": 3.998961645791421e-05, "loss": 0.6339, "step": 6270 }, { "epoch": 0.609353774500291, "grad_norm": 1.7825690304753616, "learning_rate": 3.997339217340516e-05, "loss": 0.5568, "step": 6280 }, { "epoch": 0.6103240830584126, "grad_norm": 1.5564493386693086, "learning_rate": 3.99571678888961e-05, "loss": 0.602, "step": 6290 }, { "epoch": 0.6112943916165341, "grad_norm": 1.667415447715766, "learning_rate": 3.994094360438705e-05, "loss": 0.5877, "step": 6300 }, { "epoch": 0.6122647001746555, "grad_norm": 1.6232084554425263, "learning_rate": 3.9924719319877994e-05, "loss": 0.6247, "step": 6310 }, { "epoch": 0.613235008732777, "grad_norm": 1.7806635141428606, "learning_rate": 3.9908495035368944e-05, "loss": 0.5651, "step": 6320 }, { "epoch": 0.6142053172908986, "grad_norm": 1.2898375089967073, "learning_rate": 3.989227075085989e-05, "loss": 0.6206, "step": 6330 }, { "epoch": 0.61517562584902, "grad_norm": 1.762789516395008, "learning_rate": 3.9876046466350836e-05, "loss": 0.5538, "step": 6340 }, { "epoch": 0.6161459344071415, "grad_norm": 1.8092369800491652, "learning_rate": 3.985982218184178e-05, "loss": 0.6236, "step": 6350 }, { "epoch": 0.6171162429652629, "grad_norm": 1.8060820825881239, "learning_rate": 3.984359789733273e-05, "loss": 0.5875, "step": 6360 }, { "epoch": 0.6180865515233844, "grad_norm": 1.7317977897542403, "learning_rate": 3.982737361282368e-05, "loss": 0.594, "step": 6370 }, { "epoch": 0.619056860081506, "grad_norm": 1.5504516693013288, "learning_rate": 3.981114932831462e-05, "loss": 0.5833, "step": 6380 }, { "epoch": 0.6200271686396274, "grad_norm": 1.3714460442712506, "learning_rate": 3.979492504380557e-05, "loss": 0.6048, "step": 6390 }, { "epoch": 0.6209974771977489, "grad_norm": 1.741408754147969, "learning_rate": 3.9778700759296515e-05, "loss": 0.6729, "step": 6400 }, { "epoch": 0.6219677857558704, "grad_norm": 1.0600787702033703, "learning_rate": 3.9762476474787465e-05, "loss": 0.6934, "step": 6410 }, { "epoch": 0.6229380943139918, "grad_norm": 1.5722449067246123, "learning_rate": 3.974625219027841e-05, "loss": 0.6043, "step": 6420 }, { "epoch": 0.6239084028721134, "grad_norm": 1.3176596782485965, "learning_rate": 3.973002790576936e-05, "loss": 0.5898, "step": 6430 }, { "epoch": 0.6248787114302348, "grad_norm": 1.4177190750303141, "learning_rate": 3.971380362126031e-05, "loss": 0.6146, "step": 6440 }, { "epoch": 0.6258490199883563, "grad_norm": 1.3993450966845369, "learning_rate": 3.969757933675125e-05, "loss": 0.608, "step": 6450 }, { "epoch": 0.6268193285464778, "grad_norm": 1.6552784809097676, "learning_rate": 3.96813550522422e-05, "loss": 0.6078, "step": 6460 }, { "epoch": 0.6277896371045992, "grad_norm": 1.315768136368434, "learning_rate": 3.966513076773314e-05, "loss": 0.6021, "step": 6470 }, { "epoch": 0.6287599456627208, "grad_norm": 1.431816586824017, "learning_rate": 3.964890648322409e-05, "loss": 0.5571, "step": 6480 }, { "epoch": 0.6297302542208423, "grad_norm": 1.4950226300857892, "learning_rate": 3.9632682198715036e-05, "loss": 0.6297, "step": 6490 }, { "epoch": 0.6307005627789637, "grad_norm": 2.0781333722978284, "learning_rate": 3.9616457914205985e-05, "loss": 0.596, "step": 6500 }, { "epoch": 0.6307005627789637, "eval_loss": 0.6844401955604553, "eval_runtime": 2468.3658, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 6500 }, { "epoch": 0.6316708713370852, "grad_norm": 1.5337482284796442, "learning_rate": 3.960023362969693e-05, "loss": 0.6634, "step": 6510 }, { "epoch": 0.6326411798952066, "grad_norm": 1.7040234243162709, "learning_rate": 3.958400934518788e-05, "loss": 0.5936, "step": 6520 }, { "epoch": 0.6336114884533282, "grad_norm": 1.7943789476429368, "learning_rate": 3.956778506067883e-05, "loss": 0.6361, "step": 6530 }, { "epoch": 0.6345817970114497, "grad_norm": 1.5737933588874193, "learning_rate": 3.955156077616977e-05, "loss": 0.6192, "step": 6540 }, { "epoch": 0.6355521055695711, "grad_norm": 1.532774889369556, "learning_rate": 3.953533649166072e-05, "loss": 0.5694, "step": 6550 }, { "epoch": 0.6365224141276926, "grad_norm": 1.6854751291833254, "learning_rate": 3.9519112207151664e-05, "loss": 0.6365, "step": 6560 }, { "epoch": 0.6374927226858141, "grad_norm": 1.4850693595634958, "learning_rate": 3.9502887922642613e-05, "loss": 0.5966, "step": 6570 }, { "epoch": 0.6384630312439356, "grad_norm": 1.5032167913978984, "learning_rate": 3.9486663638133556e-05, "loss": 0.6413, "step": 6580 }, { "epoch": 0.6394333398020571, "grad_norm": 1.4058296063426399, "learning_rate": 3.9470439353624506e-05, "loss": 0.5845, "step": 6590 }, { "epoch": 0.6404036483601785, "grad_norm": 1.563521067932564, "learning_rate": 3.945421506911545e-05, "loss": 0.6179, "step": 6600 }, { "epoch": 0.6413739569183, "grad_norm": 1.4036808350972751, "learning_rate": 3.94379907846064e-05, "loss": 0.5942, "step": 6610 }, { "epoch": 0.6423442654764215, "grad_norm": 1.4964922216668848, "learning_rate": 3.942176650009735e-05, "loss": 0.6516, "step": 6620 }, { "epoch": 0.643314574034543, "grad_norm": 1.6801427039047954, "learning_rate": 3.940554221558829e-05, "loss": 0.6192, "step": 6630 }, { "epoch": 0.6442848825926645, "grad_norm": 1.582294622357827, "learning_rate": 3.938931793107924e-05, "loss": 0.6051, "step": 6640 }, { "epoch": 0.645255191150786, "grad_norm": 1.531859995817891, "learning_rate": 3.9373093646570185e-05, "loss": 0.6407, "step": 6650 }, { "epoch": 0.6462254997089074, "grad_norm": 1.9088041020547684, "learning_rate": 3.9356869362061134e-05, "loss": 0.6078, "step": 6660 }, { "epoch": 0.6471958082670289, "grad_norm": 1.3066981837793257, "learning_rate": 3.934064507755208e-05, "loss": 0.5665, "step": 6670 }, { "epoch": 0.6481661168251504, "grad_norm": 1.313352722546981, "learning_rate": 3.932442079304303e-05, "loss": 0.5201, "step": 6680 }, { "epoch": 0.6491364253832719, "grad_norm": 1.8145770193834194, "learning_rate": 3.930819650853397e-05, "loss": 0.636, "step": 6690 }, { "epoch": 0.6501067339413934, "grad_norm": 1.6198734288858008, "learning_rate": 3.929197222402492e-05, "loss": 0.6529, "step": 6700 }, { "epoch": 0.6510770424995148, "grad_norm": 1.6029042771301745, "learning_rate": 3.927574793951587e-05, "loss": 0.5777, "step": 6710 }, { "epoch": 0.6520473510576363, "grad_norm": 1.7982640944263595, "learning_rate": 3.925952365500681e-05, "loss": 0.5833, "step": 6720 }, { "epoch": 0.6530176596157579, "grad_norm": 1.9034497019204402, "learning_rate": 3.924329937049776e-05, "loss": 0.5981, "step": 6730 }, { "epoch": 0.6539879681738793, "grad_norm": 1.3239435404528657, "learning_rate": 3.9227075085988705e-05, "loss": 0.5861, "step": 6740 }, { "epoch": 0.6549582767320008, "grad_norm": 1.7333381509447607, "learning_rate": 3.921085080147966e-05, "loss": 0.5696, "step": 6750 }, { "epoch": 0.6559285852901222, "grad_norm": 1.7408114949745195, "learning_rate": 3.9194626516970605e-05, "loss": 0.5868, "step": 6760 }, { "epoch": 0.6568988938482437, "grad_norm": 1.4140478484561587, "learning_rate": 3.9178402232461555e-05, "loss": 0.6126, "step": 6770 }, { "epoch": 0.6578692024063653, "grad_norm": 1.2359753742563322, "learning_rate": 3.91621779479525e-05, "loss": 0.595, "step": 6780 }, { "epoch": 0.6588395109644867, "grad_norm": 1.4986954153208918, "learning_rate": 3.914595366344345e-05, "loss": 0.6399, "step": 6790 }, { "epoch": 0.6598098195226082, "grad_norm": 1.60396773259511, "learning_rate": 3.912972937893439e-05, "loss": 0.573, "step": 6800 }, { "epoch": 0.6607801280807297, "grad_norm": 1.6446241731745532, "learning_rate": 3.911350509442534e-05, "loss": 0.5758, "step": 6810 }, { "epoch": 0.6617504366388511, "grad_norm": 1.6940327126152295, "learning_rate": 3.909728080991629e-05, "loss": 0.5646, "step": 6820 }, { "epoch": 0.6627207451969727, "grad_norm": 1.6725911752868794, "learning_rate": 3.908105652540723e-05, "loss": 0.6282, "step": 6830 }, { "epoch": 0.6636910537550941, "grad_norm": 1.724226678968073, "learning_rate": 3.906483224089818e-05, "loss": 0.6391, "step": 6840 }, { "epoch": 0.6646613623132156, "grad_norm": 1.5959383703623191, "learning_rate": 3.9048607956389126e-05, "loss": 0.6003, "step": 6850 }, { "epoch": 0.6656316708713371, "grad_norm": 1.7487148965006274, "learning_rate": 3.9032383671880075e-05, "loss": 0.6019, "step": 6860 }, { "epoch": 0.6666019794294585, "grad_norm": 2.0088682339975312, "learning_rate": 3.901615938737102e-05, "loss": 0.5846, "step": 6870 }, { "epoch": 0.6675722879875801, "grad_norm": 1.5646052877933723, "learning_rate": 3.899993510286197e-05, "loss": 0.6222, "step": 6880 }, { "epoch": 0.6685425965457016, "grad_norm": 1.5750290096746722, "learning_rate": 3.898371081835291e-05, "loss": 0.6018, "step": 6890 }, { "epoch": 0.669512905103823, "grad_norm": 1.2258053581934112, "learning_rate": 3.896748653384386e-05, "loss": 0.5697, "step": 6900 }, { "epoch": 0.6704832136619445, "grad_norm": 1.9376975148901512, "learning_rate": 3.895126224933481e-05, "loss": 0.6168, "step": 6910 }, { "epoch": 0.6714535222200659, "grad_norm": 1.4670771869188424, "learning_rate": 3.8935037964825754e-05, "loss": 0.5964, "step": 6920 }, { "epoch": 0.6724238307781875, "grad_norm": 1.324084539118919, "learning_rate": 3.8918813680316704e-05, "loss": 0.5346, "step": 6930 }, { "epoch": 0.673394139336309, "grad_norm": 1.717496207353836, "learning_rate": 3.8902589395807647e-05, "loss": 0.6112, "step": 6940 }, { "epoch": 0.6743644478944304, "grad_norm": 2.019875964877309, "learning_rate": 3.8886365111298596e-05, "loss": 0.6522, "step": 6950 }, { "epoch": 0.6753347564525519, "grad_norm": 1.3354957180640497, "learning_rate": 3.887014082678954e-05, "loss": 0.6256, "step": 6960 }, { "epoch": 0.6763050650106734, "grad_norm": 1.6937952057212555, "learning_rate": 3.885391654228049e-05, "loss": 0.6176, "step": 6970 }, { "epoch": 0.6772753735687949, "grad_norm": 1.5422166727550382, "learning_rate": 3.883769225777143e-05, "loss": 0.5546, "step": 6980 }, { "epoch": 0.6782456821269164, "grad_norm": 1.7417491098472007, "learning_rate": 3.882146797326238e-05, "loss": 0.6267, "step": 6990 }, { "epoch": 0.6792159906850378, "grad_norm": 1.5366921907221318, "learning_rate": 3.880524368875333e-05, "loss": 0.6218, "step": 7000 }, { "epoch": 0.6792159906850378, "eval_loss": 0.678726851940155, "eval_runtime": 2469.9254, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 7000 }, { "epoch": 0.6801862992431593, "grad_norm": 1.5097589222422163, "learning_rate": 3.8789019404244275e-05, "loss": 0.5925, "step": 7010 }, { "epoch": 0.6811566078012808, "grad_norm": 2.1865443717654256, "learning_rate": 3.8772795119735224e-05, "loss": 0.6125, "step": 7020 }, { "epoch": 0.6821269163594023, "grad_norm": 1.3913795509553841, "learning_rate": 3.875657083522617e-05, "loss": 0.6366, "step": 7030 }, { "epoch": 0.6830972249175238, "grad_norm": 1.314507220249943, "learning_rate": 3.874034655071712e-05, "loss": 0.5609, "step": 7040 }, { "epoch": 0.6840675334756453, "grad_norm": 1.5985439411272098, "learning_rate": 3.872412226620806e-05, "loss": 0.5946, "step": 7050 }, { "epoch": 0.6850378420337667, "grad_norm": 1.8453526914168807, "learning_rate": 3.870789798169901e-05, "loss": 0.5389, "step": 7060 }, { "epoch": 0.6860081505918882, "grad_norm": 1.746744853463753, "learning_rate": 3.869167369718995e-05, "loss": 0.5821, "step": 7070 }, { "epoch": 0.6869784591500097, "grad_norm": 1.351619112366914, "learning_rate": 3.86754494126809e-05, "loss": 0.6333, "step": 7080 }, { "epoch": 0.6879487677081312, "grad_norm": 1.6766693580749343, "learning_rate": 3.865922512817185e-05, "loss": 0.5905, "step": 7090 }, { "epoch": 0.6889190762662527, "grad_norm": 1.6907020127934513, "learning_rate": 3.8643000843662795e-05, "loss": 0.583, "step": 7100 }, { "epoch": 0.6898893848243741, "grad_norm": 1.8609743061215689, "learning_rate": 3.8626776559153745e-05, "loss": 0.6131, "step": 7110 }, { "epoch": 0.6908596933824956, "grad_norm": 1.3507375736553595, "learning_rate": 3.861055227464469e-05, "loss": 0.6399, "step": 7120 }, { "epoch": 0.6918300019406172, "grad_norm": 1.7786145489846785, "learning_rate": 3.859432799013564e-05, "loss": 0.5148, "step": 7130 }, { "epoch": 0.6928003104987386, "grad_norm": 1.7336491727001198, "learning_rate": 3.857810370562658e-05, "loss": 0.6154, "step": 7140 }, { "epoch": 0.6937706190568601, "grad_norm": 1.4164103938896966, "learning_rate": 3.856187942111753e-05, "loss": 0.5768, "step": 7150 }, { "epoch": 0.6947409276149815, "grad_norm": 1.5705135987803418, "learning_rate": 3.854565513660848e-05, "loss": 0.6102, "step": 7160 }, { "epoch": 0.695711236173103, "grad_norm": 1.4121262983361098, "learning_rate": 3.8529430852099424e-05, "loss": 0.61, "step": 7170 }, { "epoch": 0.6966815447312246, "grad_norm": 1.8472104212242206, "learning_rate": 3.851320656759037e-05, "loss": 0.5911, "step": 7180 }, { "epoch": 0.697651853289346, "grad_norm": 1.5344754452736826, "learning_rate": 3.8496982283081316e-05, "loss": 0.606, "step": 7190 }, { "epoch": 0.6986221618474675, "grad_norm": 1.6606427535357149, "learning_rate": 3.8480757998572266e-05, "loss": 0.6062, "step": 7200 }, { "epoch": 0.699592470405589, "grad_norm": 1.633183446097436, "learning_rate": 3.846453371406321e-05, "loss": 0.5828, "step": 7210 }, { "epoch": 0.7005627789637104, "grad_norm": 1.646557901144212, "learning_rate": 3.844830942955416e-05, "loss": 0.5354, "step": 7220 }, { "epoch": 0.701533087521832, "grad_norm": 1.64918421348089, "learning_rate": 3.84320851450451e-05, "loss": 0.551, "step": 7230 }, { "epoch": 0.7025033960799534, "grad_norm": 1.6382049443147468, "learning_rate": 3.841586086053605e-05, "loss": 0.5723, "step": 7240 }, { "epoch": 0.7034737046380749, "grad_norm": 2.1034428309614523, "learning_rate": 3.8399636576027e-05, "loss": 0.6282, "step": 7250 }, { "epoch": 0.7044440131961964, "grad_norm": 1.4993896668880777, "learning_rate": 3.8383412291517944e-05, "loss": 0.5221, "step": 7260 }, { "epoch": 0.7054143217543178, "grad_norm": 1.5357479402580956, "learning_rate": 3.8367188007008894e-05, "loss": 0.6176, "step": 7270 }, { "epoch": 0.7063846303124394, "grad_norm": 1.5289777074279678, "learning_rate": 3.835096372249984e-05, "loss": 0.6045, "step": 7280 }, { "epoch": 0.7073549388705609, "grad_norm": 1.6907343435757922, "learning_rate": 3.833473943799079e-05, "loss": 0.5778, "step": 7290 }, { "epoch": 0.7083252474286823, "grad_norm": 1.546188967929013, "learning_rate": 3.831851515348173e-05, "loss": 0.5966, "step": 7300 }, { "epoch": 0.7092955559868038, "grad_norm": 1.6668813308937025, "learning_rate": 3.830229086897268e-05, "loss": 0.5443, "step": 7310 }, { "epoch": 0.7102658645449252, "grad_norm": 2.0411822746490444, "learning_rate": 3.828606658446362e-05, "loss": 0.5663, "step": 7320 }, { "epoch": 0.7112361731030468, "grad_norm": 1.5825597761459882, "learning_rate": 3.826984229995457e-05, "loss": 0.6143, "step": 7330 }, { "epoch": 0.7122064816611683, "grad_norm": 1.7867885913406227, "learning_rate": 3.825361801544552e-05, "loss": 0.6094, "step": 7340 }, { "epoch": 0.7131767902192897, "grad_norm": 1.4415937345324663, "learning_rate": 3.8237393730936465e-05, "loss": 0.552, "step": 7350 }, { "epoch": 0.7141470987774112, "grad_norm": 1.8623916924146842, "learning_rate": 3.8221169446427415e-05, "loss": 0.6035, "step": 7360 }, { "epoch": 0.7151174073355328, "grad_norm": 1.6112921767840875, "learning_rate": 3.820494516191836e-05, "loss": 0.5957, "step": 7370 }, { "epoch": 0.7160877158936542, "grad_norm": 1.6295510822010462, "learning_rate": 3.818872087740931e-05, "loss": 0.5906, "step": 7380 }, { "epoch": 0.7170580244517757, "grad_norm": 1.3787962703695553, "learning_rate": 3.817249659290025e-05, "loss": 0.5884, "step": 7390 }, { "epoch": 0.7180283330098971, "grad_norm": 1.5293927693421874, "learning_rate": 3.81562723083912e-05, "loss": 0.6277, "step": 7400 }, { "epoch": 0.7189986415680186, "grad_norm": 1.4147252260121417, "learning_rate": 3.8140048023882143e-05, "loss": 0.5725, "step": 7410 }, { "epoch": 0.7199689501261402, "grad_norm": 1.7736403425897478, "learning_rate": 3.812382373937309e-05, "loss": 0.533, "step": 7420 }, { "epoch": 0.7209392586842616, "grad_norm": 1.642532067779172, "learning_rate": 3.810759945486404e-05, "loss": 0.6405, "step": 7430 }, { "epoch": 0.7219095672423831, "grad_norm": 1.8702737255141961, "learning_rate": 3.8091375170354986e-05, "loss": 0.523, "step": 7440 }, { "epoch": 0.7228798758005046, "grad_norm": 1.82343375200645, "learning_rate": 3.8075150885845936e-05, "loss": 0.54, "step": 7450 }, { "epoch": 0.723850184358626, "grad_norm": 1.6212132399265973, "learning_rate": 3.805892660133688e-05, "loss": 0.6016, "step": 7460 }, { "epoch": 0.7248204929167475, "grad_norm": 1.7707895342391216, "learning_rate": 3.804270231682783e-05, "loss": 0.5724, "step": 7470 }, { "epoch": 0.725790801474869, "grad_norm": 1.4832754109529855, "learning_rate": 3.802647803231877e-05, "loss": 0.5554, "step": 7480 }, { "epoch": 0.7267611100329905, "grad_norm": 1.4890517160303556, "learning_rate": 3.801025374780972e-05, "loss": 0.6065, "step": 7490 }, { "epoch": 0.727731418591112, "grad_norm": 2.047151127095432, "learning_rate": 3.799402946330067e-05, "loss": 0.6299, "step": 7500 }, { "epoch": 0.727731418591112, "eval_loss": 0.6740881204605103, "eval_runtime": 2470.6314, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 7500 }, { "epoch": 0.7287017271492334, "grad_norm": 1.6234544208509818, "learning_rate": 3.7977805178791614e-05, "loss": 0.6013, "step": 7510 }, { "epoch": 0.729672035707355, "grad_norm": 1.4452320512475898, "learning_rate": 3.7961580894282564e-05, "loss": 0.5479, "step": 7520 }, { "epoch": 0.7306423442654765, "grad_norm": 1.2290296761164277, "learning_rate": 3.794535660977351e-05, "loss": 0.5471, "step": 7530 }, { "epoch": 0.7316126528235979, "grad_norm": 1.567003648013224, "learning_rate": 3.792913232526446e-05, "loss": 0.6092, "step": 7540 }, { "epoch": 0.7325829613817194, "grad_norm": 1.6011455152210408, "learning_rate": 3.7912908040755406e-05, "loss": 0.629, "step": 7550 }, { "epoch": 0.7335532699398408, "grad_norm": 2.1218593111989192, "learning_rate": 3.7896683756246356e-05, "loss": 0.5773, "step": 7560 }, { "epoch": 0.7345235784979623, "grad_norm": 1.8323104555880425, "learning_rate": 3.78804594717373e-05, "loss": 0.618, "step": 7570 }, { "epoch": 0.7354938870560839, "grad_norm": 1.4152516707860787, "learning_rate": 3.786423518722825e-05, "loss": 0.6062, "step": 7580 }, { "epoch": 0.7364641956142053, "grad_norm": 1.6284692342846199, "learning_rate": 3.784801090271919e-05, "loss": 0.5671, "step": 7590 }, { "epoch": 0.7374345041723268, "grad_norm": 1.426043120608999, "learning_rate": 3.783178661821014e-05, "loss": 0.5972, "step": 7600 }, { "epoch": 0.7384048127304483, "grad_norm": 1.893183882199419, "learning_rate": 3.7815562333701085e-05, "loss": 0.6193, "step": 7610 }, { "epoch": 0.7393751212885697, "grad_norm": 1.9982181406611617, "learning_rate": 3.7799338049192034e-05, "loss": 0.5532, "step": 7620 }, { "epoch": 0.7403454298466913, "grad_norm": 1.8557864601006913, "learning_rate": 3.7783113764682984e-05, "loss": 0.5777, "step": 7630 }, { "epoch": 0.7413157384048127, "grad_norm": 1.8775040642513798, "learning_rate": 3.776688948017393e-05, "loss": 0.6268, "step": 7640 }, { "epoch": 0.7422860469629342, "grad_norm": 1.758574155734976, "learning_rate": 3.775066519566488e-05, "loss": 0.5662, "step": 7650 }, { "epoch": 0.7432563555210557, "grad_norm": 2.194684585403217, "learning_rate": 3.773444091115582e-05, "loss": 0.6628, "step": 7660 }, { "epoch": 0.7442266640791771, "grad_norm": 1.3673983887882022, "learning_rate": 3.771821662664677e-05, "loss": 0.6061, "step": 7670 }, { "epoch": 0.7451969726372987, "grad_norm": 1.2724099570692131, "learning_rate": 3.770199234213771e-05, "loss": 0.6337, "step": 7680 }, { "epoch": 0.7461672811954202, "grad_norm": 1.6823171665935568, "learning_rate": 3.768576805762866e-05, "loss": 0.5528, "step": 7690 }, { "epoch": 0.7471375897535416, "grad_norm": 1.9633396330523931, "learning_rate": 3.7669543773119605e-05, "loss": 0.5808, "step": 7700 }, { "epoch": 0.7481078983116631, "grad_norm": 1.7782111547784767, "learning_rate": 3.7653319488610555e-05, "loss": 0.5767, "step": 7710 }, { "epoch": 0.7490782068697845, "grad_norm": 2.3110549543336205, "learning_rate": 3.7637095204101505e-05, "loss": 0.6062, "step": 7720 }, { "epoch": 0.7500485154279061, "grad_norm": 1.6939270574842946, "learning_rate": 3.762087091959245e-05, "loss": 0.6225, "step": 7730 }, { "epoch": 0.7510188239860276, "grad_norm": 1.4820399391564874, "learning_rate": 3.76046466350834e-05, "loss": 0.6047, "step": 7740 }, { "epoch": 0.751989132544149, "grad_norm": 1.5672189262381615, "learning_rate": 3.758842235057434e-05, "loss": 0.5879, "step": 7750 }, { "epoch": 0.7529594411022705, "grad_norm": 1.6472834744978406, "learning_rate": 3.757219806606529e-05, "loss": 0.5895, "step": 7760 }, { "epoch": 0.7539297496603921, "grad_norm": 1.1201299864920753, "learning_rate": 3.7555973781556234e-05, "loss": 0.5801, "step": 7770 }, { "epoch": 0.7549000582185135, "grad_norm": 2.1519831187317107, "learning_rate": 3.753974949704718e-05, "loss": 0.5806, "step": 7780 }, { "epoch": 0.755870366776635, "grad_norm": 1.6558911877127844, "learning_rate": 3.752352521253813e-05, "loss": 0.6243, "step": 7790 }, { "epoch": 0.7568406753347564, "grad_norm": 1.56259604588081, "learning_rate": 3.7507300928029076e-05, "loss": 0.5611, "step": 7800 }, { "epoch": 0.7578109838928779, "grad_norm": 1.7228227668603993, "learning_rate": 3.7491076643520026e-05, "loss": 0.5517, "step": 7810 }, { "epoch": 0.7587812924509995, "grad_norm": 1.4902669117767953, "learning_rate": 3.747485235901097e-05, "loss": 0.538, "step": 7820 }, { "epoch": 0.7597516010091209, "grad_norm": 1.782224663749718, "learning_rate": 3.745862807450192e-05, "loss": 0.5753, "step": 7830 }, { "epoch": 0.7607219095672424, "grad_norm": 1.9997818457050736, "learning_rate": 3.744240378999286e-05, "loss": 0.6007, "step": 7840 }, { "epoch": 0.7616922181253639, "grad_norm": 1.4094657223176301, "learning_rate": 3.742617950548381e-05, "loss": 0.6338, "step": 7850 }, { "epoch": 0.7626625266834853, "grad_norm": 1.9288686398933608, "learning_rate": 3.7409955220974754e-05, "loss": 0.5906, "step": 7860 }, { "epoch": 0.7636328352416069, "grad_norm": 1.7390584642787001, "learning_rate": 3.7393730936465704e-05, "loss": 0.5738, "step": 7870 }, { "epoch": 0.7646031437997283, "grad_norm": 1.9438974814009289, "learning_rate": 3.7377506651956654e-05, "loss": 0.5121, "step": 7880 }, { "epoch": 0.7655734523578498, "grad_norm": 1.740068812702714, "learning_rate": 3.73612823674476e-05, "loss": 0.557, "step": 7890 }, { "epoch": 0.7665437609159713, "grad_norm": 1.7983844321630307, "learning_rate": 3.734505808293855e-05, "loss": 0.5748, "step": 7900 }, { "epoch": 0.7675140694740927, "grad_norm": 1.5464273991421298, "learning_rate": 3.732883379842949e-05, "loss": 0.547, "step": 7910 }, { "epoch": 0.7684843780322143, "grad_norm": 1.5357637162840818, "learning_rate": 3.731260951392044e-05, "loss": 0.5628, "step": 7920 }, { "epoch": 0.7694546865903358, "grad_norm": 1.5933643507985389, "learning_rate": 3.729638522941138e-05, "loss": 0.6377, "step": 7930 }, { "epoch": 0.7704249951484572, "grad_norm": 1.885102854313203, "learning_rate": 3.728016094490233e-05, "loss": 0.5542, "step": 7940 }, { "epoch": 0.7713953037065787, "grad_norm": 1.2646336337440816, "learning_rate": 3.7263936660393275e-05, "loss": 0.5909, "step": 7950 }, { "epoch": 0.7723656122647001, "grad_norm": 1.5489929838962764, "learning_rate": 3.7247712375884225e-05, "loss": 0.6417, "step": 7960 }, { "epoch": 0.7733359208228217, "grad_norm": 1.5642843810424312, "learning_rate": 3.7231488091375175e-05, "loss": 0.5739, "step": 7970 }, { "epoch": 0.7743062293809432, "grad_norm": 1.2994344306176584, "learning_rate": 3.721526380686612e-05, "loss": 0.6521, "step": 7980 }, { "epoch": 0.7752765379390646, "grad_norm": 1.7547352757346097, "learning_rate": 3.719903952235707e-05, "loss": 0.6055, "step": 7990 }, { "epoch": 0.7762468464971861, "grad_norm": 1.8680201530781706, "learning_rate": 3.718281523784801e-05, "loss": 0.5468, "step": 8000 }, { "epoch": 0.7762468464971861, "eval_loss": 0.6728888750076294, "eval_runtime": 2470.4642, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 8000 }, { "epoch": 0.7772171550553076, "grad_norm": 1.8353606131921394, "learning_rate": 3.716659095333896e-05, "loss": 0.6019, "step": 8010 }, { "epoch": 0.778187463613429, "grad_norm": 2.1165186332460624, "learning_rate": 3.71503666688299e-05, "loss": 0.6144, "step": 8020 }, { "epoch": 0.7791577721715506, "grad_norm": 1.2983357938097837, "learning_rate": 3.713414238432085e-05, "loss": 0.5765, "step": 8030 }, { "epoch": 0.780128080729672, "grad_norm": 1.6998652219779764, "learning_rate": 3.7117918099811796e-05, "loss": 0.5673, "step": 8040 }, { "epoch": 0.7810983892877935, "grad_norm": 1.6651384645995448, "learning_rate": 3.7101693815302746e-05, "loss": 0.5813, "step": 8050 }, { "epoch": 0.782068697845915, "grad_norm": 1.711156688730374, "learning_rate": 3.7085469530793696e-05, "loss": 0.5804, "step": 8060 }, { "epoch": 0.7830390064040365, "grad_norm": 1.8429084688690953, "learning_rate": 3.706924524628464e-05, "loss": 0.6076, "step": 8070 }, { "epoch": 0.784009314962158, "grad_norm": 2.029890621987729, "learning_rate": 3.705302096177559e-05, "loss": 0.6248, "step": 8080 }, { "epoch": 0.7849796235202795, "grad_norm": 2.075042801349425, "learning_rate": 3.703679667726653e-05, "loss": 0.5546, "step": 8090 }, { "epoch": 0.7859499320784009, "grad_norm": 1.9442644766970887, "learning_rate": 3.702057239275748e-05, "loss": 0.5754, "step": 8100 }, { "epoch": 0.7869202406365224, "grad_norm": 2.2898169839511167, "learning_rate": 3.7004348108248424e-05, "loss": 0.5479, "step": 8110 }, { "epoch": 0.7878905491946439, "grad_norm": 1.1971567816899984, "learning_rate": 3.6988123823739374e-05, "loss": 0.5841, "step": 8120 }, { "epoch": 0.7888608577527654, "grad_norm": 1.6483060799493618, "learning_rate": 3.697189953923032e-05, "loss": 0.5993, "step": 8130 }, { "epoch": 0.7898311663108869, "grad_norm": 2.1844480030037094, "learning_rate": 3.695567525472127e-05, "loss": 0.5639, "step": 8140 }, { "epoch": 0.7908014748690083, "grad_norm": 1.9208684488820418, "learning_rate": 3.6939450970212216e-05, "loss": 0.5548, "step": 8150 }, { "epoch": 0.7917717834271298, "grad_norm": 1.7298769058966599, "learning_rate": 3.692322668570316e-05, "loss": 0.6189, "step": 8160 }, { "epoch": 0.7927420919852514, "grad_norm": 2.043188779971351, "learning_rate": 3.690700240119411e-05, "loss": 0.6341, "step": 8170 }, { "epoch": 0.7937124005433728, "grad_norm": 1.4155988967270856, "learning_rate": 3.689077811668505e-05, "loss": 0.5803, "step": 8180 }, { "epoch": 0.7946827091014943, "grad_norm": 1.876027618508082, "learning_rate": 3.6874553832176e-05, "loss": 0.5469, "step": 8190 }, { "epoch": 0.7956530176596157, "grad_norm": 1.8526890916404788, "learning_rate": 3.6858329547666945e-05, "loss": 0.5692, "step": 8200 }, { "epoch": 0.7966233262177372, "grad_norm": 1.454859578114591, "learning_rate": 3.6842105263157895e-05, "loss": 0.5861, "step": 8210 }, { "epoch": 0.7975936347758588, "grad_norm": 1.6396970355230962, "learning_rate": 3.6825880978648844e-05, "loss": 0.5864, "step": 8220 }, { "epoch": 0.7985639433339802, "grad_norm": 2.035219184211101, "learning_rate": 3.680965669413979e-05, "loss": 0.5761, "step": 8230 }, { "epoch": 0.7995342518921017, "grad_norm": 1.4771264704118183, "learning_rate": 3.679343240963074e-05, "loss": 0.5901, "step": 8240 }, { "epoch": 0.8005045604502232, "grad_norm": 1.366640822196709, "learning_rate": 3.677720812512168e-05, "loss": 0.5976, "step": 8250 }, { "epoch": 0.8014748690083446, "grad_norm": 1.6158534552665804, "learning_rate": 3.676098384061263e-05, "loss": 0.5676, "step": 8260 }, { "epoch": 0.8024451775664662, "grad_norm": 1.906704908176893, "learning_rate": 3.674475955610357e-05, "loss": 0.6014, "step": 8270 }, { "epoch": 0.8034154861245876, "grad_norm": 1.72316238215741, "learning_rate": 3.672853527159452e-05, "loss": 0.5341, "step": 8280 }, { "epoch": 0.8043857946827091, "grad_norm": 1.6228489307580705, "learning_rate": 3.6712310987085466e-05, "loss": 0.6174, "step": 8290 }, { "epoch": 0.8053561032408306, "grad_norm": 1.6343652614101287, "learning_rate": 3.6696086702576416e-05, "loss": 0.5628, "step": 8300 }, { "epoch": 0.806326411798952, "grad_norm": 1.4875484854044787, "learning_rate": 3.6679862418067365e-05, "loss": 0.5559, "step": 8310 }, { "epoch": 0.8072967203570736, "grad_norm": 1.7360867259906014, "learning_rate": 3.6663638133558315e-05, "loss": 0.6122, "step": 8320 }, { "epoch": 0.8082670289151951, "grad_norm": 1.3760924603009739, "learning_rate": 3.664741384904926e-05, "loss": 0.5369, "step": 8330 }, { "epoch": 0.8092373374733165, "grad_norm": 2.1326347379272033, "learning_rate": 3.663118956454021e-05, "loss": 0.6026, "step": 8340 }, { "epoch": 0.810207646031438, "grad_norm": 1.8445530020541556, "learning_rate": 3.661496528003116e-05, "loss": 0.5924, "step": 8350 }, { "epoch": 0.8111779545895594, "grad_norm": 2.1620839328051153, "learning_rate": 3.65987409955221e-05, "loss": 0.5556, "step": 8360 }, { "epoch": 0.812148263147681, "grad_norm": 1.91334025126679, "learning_rate": 3.658251671101305e-05, "loss": 0.5787, "step": 8370 }, { "epoch": 0.8131185717058025, "grad_norm": 1.6197394739350461, "learning_rate": 3.656629242650399e-05, "loss": 0.5364, "step": 8380 }, { "epoch": 0.8140888802639239, "grad_norm": 1.7496855520727714, "learning_rate": 3.655006814199494e-05, "loss": 0.604, "step": 8390 }, { "epoch": 0.8150591888220454, "grad_norm": 1.9072833025355298, "learning_rate": 3.6533843857485886e-05, "loss": 0.6127, "step": 8400 }, { "epoch": 0.816029497380167, "grad_norm": 1.693494475477864, "learning_rate": 3.6517619572976836e-05, "loss": 0.5708, "step": 8410 }, { "epoch": 0.8169998059382884, "grad_norm": 1.694549481689151, "learning_rate": 3.6501395288467786e-05, "loss": 0.544, "step": 8420 }, { "epoch": 0.8179701144964099, "grad_norm": 1.77015793724743, "learning_rate": 3.648517100395873e-05, "loss": 0.5973, "step": 8430 }, { "epoch": 0.8189404230545313, "grad_norm": 1.738049084314835, "learning_rate": 3.646894671944968e-05, "loss": 0.5912, "step": 8440 }, { "epoch": 0.8199107316126528, "grad_norm": 1.647267019109384, "learning_rate": 3.645272243494062e-05, "loss": 0.6346, "step": 8450 }, { "epoch": 0.8208810401707743, "grad_norm": 1.5240403667548303, "learning_rate": 3.643649815043157e-05, "loss": 0.6057, "step": 8460 }, { "epoch": 0.8218513487288958, "grad_norm": 1.9527577485511822, "learning_rate": 3.6420273865922514e-05, "loss": 0.5615, "step": 8470 }, { "epoch": 0.8228216572870173, "grad_norm": 1.937531929878238, "learning_rate": 3.6404049581413464e-05, "loss": 0.5625, "step": 8480 }, { "epoch": 0.8237919658451388, "grad_norm": 1.7329982779170732, "learning_rate": 3.638782529690441e-05, "loss": 0.613, "step": 8490 }, { "epoch": 0.8247622744032602, "grad_norm": 1.880843027338221, "learning_rate": 3.637160101239536e-05, "loss": 0.5869, "step": 8500 }, { "epoch": 0.8247622744032602, "eval_loss": 0.6701070070266724, "eval_runtime": 2471.9751, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 8500 }, { "epoch": 0.8257325829613817, "grad_norm": 1.2566468431839666, "learning_rate": 3.6355376727886307e-05, "loss": 0.5433, "step": 8510 }, { "epoch": 0.8267028915195032, "grad_norm": 1.8546070309693423, "learning_rate": 3.633915244337725e-05, "loss": 0.5669, "step": 8520 }, { "epoch": 0.8276732000776247, "grad_norm": 1.268249173975751, "learning_rate": 3.63229281588682e-05, "loss": 0.5895, "step": 8530 }, { "epoch": 0.8286435086357462, "grad_norm": 1.7652424871651577, "learning_rate": 3.630670387435914e-05, "loss": 0.5597, "step": 8540 }, { "epoch": 0.8296138171938676, "grad_norm": 1.5480894985375517, "learning_rate": 3.629047958985009e-05, "loss": 0.5486, "step": 8550 }, { "epoch": 0.8305841257519891, "grad_norm": 1.750414852898824, "learning_rate": 3.6274255305341035e-05, "loss": 0.584, "step": 8560 }, { "epoch": 0.8315544343101107, "grad_norm": 1.7102419104322222, "learning_rate": 3.6258031020831985e-05, "loss": 0.5668, "step": 8570 }, { "epoch": 0.8325247428682321, "grad_norm": 1.5528462400057457, "learning_rate": 3.624180673632293e-05, "loss": 0.5896, "step": 8580 }, { "epoch": 0.8334950514263536, "grad_norm": 1.4801001203063542, "learning_rate": 3.622558245181388e-05, "loss": 0.5997, "step": 8590 }, { "epoch": 0.834465359984475, "grad_norm": 1.391432340520497, "learning_rate": 3.620935816730483e-05, "loss": 0.5795, "step": 8600 }, { "epoch": 0.8354356685425965, "grad_norm": 1.4373875976848383, "learning_rate": 3.619313388279577e-05, "loss": 0.5769, "step": 8610 }, { "epoch": 0.8364059771007181, "grad_norm": 1.8411413590399401, "learning_rate": 3.617690959828672e-05, "loss": 0.5626, "step": 8620 }, { "epoch": 0.8373762856588395, "grad_norm": 1.6990663233303336, "learning_rate": 3.616068531377766e-05, "loss": 0.6057, "step": 8630 }, { "epoch": 0.838346594216961, "grad_norm": 1.9737814861957261, "learning_rate": 3.614446102926861e-05, "loss": 0.564, "step": 8640 }, { "epoch": 0.8393169027750825, "grad_norm": 1.6163853470752478, "learning_rate": 3.6128236744759556e-05, "loss": 0.5429, "step": 8650 }, { "epoch": 0.8402872113332039, "grad_norm": 1.5911941575436375, "learning_rate": 3.6112012460250506e-05, "loss": 0.6203, "step": 8660 }, { "epoch": 0.8412575198913255, "grad_norm": 2.101242238346334, "learning_rate": 3.609578817574145e-05, "loss": 0.5263, "step": 8670 }, { "epoch": 0.8422278284494469, "grad_norm": 1.671876183792032, "learning_rate": 3.60795638912324e-05, "loss": 0.5699, "step": 8680 }, { "epoch": 0.8431981370075684, "grad_norm": 1.7729527985320428, "learning_rate": 3.606333960672335e-05, "loss": 0.5525, "step": 8690 }, { "epoch": 0.8441684455656899, "grad_norm": 1.7198189021299524, "learning_rate": 3.604711532221429e-05, "loss": 0.5789, "step": 8700 }, { "epoch": 0.8451387541238113, "grad_norm": 1.754160746828024, "learning_rate": 3.603089103770524e-05, "loss": 0.5126, "step": 8710 }, { "epoch": 0.8461090626819329, "grad_norm": 1.8621321453517432, "learning_rate": 3.6014666753196184e-05, "loss": 0.593, "step": 8720 }, { "epoch": 0.8470793712400544, "grad_norm": 1.4506925576933114, "learning_rate": 3.5998442468687134e-05, "loss": 0.5489, "step": 8730 }, { "epoch": 0.8480496797981758, "grad_norm": 1.9426709583278723, "learning_rate": 3.598221818417808e-05, "loss": 0.498, "step": 8740 }, { "epoch": 0.8490199883562973, "grad_norm": 1.7346244341640757, "learning_rate": 3.5965993899669026e-05, "loss": 0.5583, "step": 8750 }, { "epoch": 0.8499902969144187, "grad_norm": 1.926381813212122, "learning_rate": 3.594976961515997e-05, "loss": 0.5931, "step": 8760 }, { "epoch": 0.8509606054725403, "grad_norm": 1.9797943448639521, "learning_rate": 3.593354533065092e-05, "loss": 0.5773, "step": 8770 }, { "epoch": 0.8519309140306618, "grad_norm": 2.243092907709638, "learning_rate": 3.591732104614187e-05, "loss": 0.5862, "step": 8780 }, { "epoch": 0.8529012225887832, "grad_norm": 1.6686507496640315, "learning_rate": 3.590109676163281e-05, "loss": 0.5861, "step": 8790 }, { "epoch": 0.8538715311469047, "grad_norm": 1.767743857045935, "learning_rate": 3.588487247712376e-05, "loss": 0.6008, "step": 8800 }, { "epoch": 0.8548418397050263, "grad_norm": 1.789840488197533, "learning_rate": 3.5868648192614705e-05, "loss": 0.5578, "step": 8810 }, { "epoch": 0.8558121482631477, "grad_norm": 1.826043320699609, "learning_rate": 3.5852423908105655e-05, "loss": 0.5568, "step": 8820 }, { "epoch": 0.8567824568212692, "grad_norm": 1.4847524529624125, "learning_rate": 3.58361996235966e-05, "loss": 0.5984, "step": 8830 }, { "epoch": 0.8577527653793906, "grad_norm": 1.5302240137475795, "learning_rate": 3.581997533908755e-05, "loss": 0.5619, "step": 8840 }, { "epoch": 0.8587230739375121, "grad_norm": 1.6025870419513641, "learning_rate": 3.58037510545785e-05, "loss": 0.601, "step": 8850 }, { "epoch": 0.8596933824956337, "grad_norm": 1.8930896480563768, "learning_rate": 3.578752677006944e-05, "loss": 0.5935, "step": 8860 }, { "epoch": 0.8606636910537551, "grad_norm": 1.6413896209556986, "learning_rate": 3.577130248556039e-05, "loss": 0.6019, "step": 8870 }, { "epoch": 0.8616339996118766, "grad_norm": 1.6030981794189743, "learning_rate": 3.575507820105133e-05, "loss": 0.5761, "step": 8880 }, { "epoch": 0.8626043081699981, "grad_norm": 1.4798866694040977, "learning_rate": 3.573885391654228e-05, "loss": 0.5561, "step": 8890 }, { "epoch": 0.8635746167281195, "grad_norm": 1.5790323051766768, "learning_rate": 3.5722629632033226e-05, "loss": 0.5986, "step": 8900 }, { "epoch": 0.864544925286241, "grad_norm": 1.8409121182605548, "learning_rate": 3.5706405347524175e-05, "loss": 0.5878, "step": 8910 }, { "epoch": 0.8655152338443625, "grad_norm": 1.4956048954650922, "learning_rate": 3.569018106301512e-05, "loss": 0.5548, "step": 8920 }, { "epoch": 0.866485542402484, "grad_norm": 1.6969221289264929, "learning_rate": 3.567395677850607e-05, "loss": 0.5937, "step": 8930 }, { "epoch": 0.8674558509606055, "grad_norm": 1.8986285895813184, "learning_rate": 3.565773249399702e-05, "loss": 0.5742, "step": 8940 }, { "epoch": 0.8684261595187269, "grad_norm": 1.5434045122281053, "learning_rate": 3.564150820948796e-05, "loss": 0.561, "step": 8950 }, { "epoch": 0.8693964680768484, "grad_norm": 1.69523497453855, "learning_rate": 3.562528392497891e-05, "loss": 0.5386, "step": 8960 }, { "epoch": 0.87036677663497, "grad_norm": 1.382817285428295, "learning_rate": 3.5609059640469854e-05, "loss": 0.555, "step": 8970 }, { "epoch": 0.8713370851930914, "grad_norm": 1.7354010988203836, "learning_rate": 3.5592835355960803e-05, "loss": 0.5963, "step": 8980 }, { "epoch": 0.8723073937512129, "grad_norm": 1.9761497522561469, "learning_rate": 3.5576611071451746e-05, "loss": 0.5859, "step": 8990 }, { "epoch": 0.8732777023093343, "grad_norm": 1.6264062899083487, "learning_rate": 3.5560386786942696e-05, "loss": 0.5596, "step": 9000 }, { "epoch": 0.8732777023093343, "eval_loss": 0.664995551109314, "eval_runtime": 2472.6792, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 9000 }, { "epoch": 0.8742480108674558, "grad_norm": 1.7886226308798359, "learning_rate": 3.554416250243364e-05, "loss": 0.5484, "step": 9010 }, { "epoch": 0.8752183194255774, "grad_norm": 1.5768009291424698, "learning_rate": 3.552793821792459e-05, "loss": 0.5511, "step": 9020 }, { "epoch": 0.8761886279836988, "grad_norm": 1.5632964481761753, "learning_rate": 3.551171393341554e-05, "loss": 0.5173, "step": 9030 }, { "epoch": 0.8771589365418203, "grad_norm": 2.0563484372782264, "learning_rate": 3.549548964890648e-05, "loss": 0.5586, "step": 9040 }, { "epoch": 0.8781292450999418, "grad_norm": 1.9786114879020535, "learning_rate": 3.547926536439743e-05, "loss": 0.5767, "step": 9050 }, { "epoch": 0.8790995536580632, "grad_norm": 1.2143656544779613, "learning_rate": 3.5463041079888375e-05, "loss": 0.5469, "step": 9060 }, { "epoch": 0.8800698622161848, "grad_norm": 1.696976506773579, "learning_rate": 3.5446816795379324e-05, "loss": 0.5819, "step": 9070 }, { "epoch": 0.8810401707743062, "grad_norm": 1.8002454640393335, "learning_rate": 3.543059251087027e-05, "loss": 0.5947, "step": 9080 }, { "epoch": 0.8820104793324277, "grad_norm": 1.826272101277873, "learning_rate": 3.541436822636122e-05, "loss": 0.632, "step": 9090 }, { "epoch": 0.8829807878905492, "grad_norm": 1.8609691197855038, "learning_rate": 3.539814394185217e-05, "loss": 0.649, "step": 9100 }, { "epoch": 0.8839510964486706, "grad_norm": 1.7165708438826977, "learning_rate": 3.5381919657343117e-05, "loss": 0.5746, "step": 9110 }, { "epoch": 0.8849214050067922, "grad_norm": 2.0390486982174454, "learning_rate": 3.536569537283406e-05, "loss": 0.5834, "step": 9120 }, { "epoch": 0.8858917135649137, "grad_norm": 1.3107579501174733, "learning_rate": 3.534947108832501e-05, "loss": 0.5605, "step": 9130 }, { "epoch": 0.8868620221230351, "grad_norm": 1.7750714401302556, "learning_rate": 3.533324680381596e-05, "loss": 0.5778, "step": 9140 }, { "epoch": 0.8878323306811566, "grad_norm": 1.9002696712889475, "learning_rate": 3.53170225193069e-05, "loss": 0.5928, "step": 9150 }, { "epoch": 0.888802639239278, "grad_norm": 1.602034501274119, "learning_rate": 3.530079823479785e-05, "loss": 0.5941, "step": 9160 }, { "epoch": 0.8897729477973996, "grad_norm": 1.5338224625276715, "learning_rate": 3.5284573950288795e-05, "loss": 0.5678, "step": 9170 }, { "epoch": 0.8907432563555211, "grad_norm": 1.6656771631689804, "learning_rate": 3.5268349665779745e-05, "loss": 0.5621, "step": 9180 }, { "epoch": 0.8917135649136425, "grad_norm": 1.6734086655118368, "learning_rate": 3.525212538127069e-05, "loss": 0.5836, "step": 9190 }, { "epoch": 0.892683873471764, "grad_norm": 1.7321566684016627, "learning_rate": 3.523590109676164e-05, "loss": 0.6049, "step": 9200 }, { "epoch": 0.8936541820298856, "grad_norm": 1.8468990646584322, "learning_rate": 3.521967681225258e-05, "loss": 0.5265, "step": 9210 }, { "epoch": 0.894624490588007, "grad_norm": 1.8260102068484456, "learning_rate": 3.520345252774353e-05, "loss": 0.6003, "step": 9220 }, { "epoch": 0.8955947991461285, "grad_norm": 1.5049364704966368, "learning_rate": 3.518722824323448e-05, "loss": 0.6362, "step": 9230 }, { "epoch": 0.8965651077042499, "grad_norm": 1.8567290478944525, "learning_rate": 3.517100395872542e-05, "loss": 0.5666, "step": 9240 }, { "epoch": 0.8975354162623714, "grad_norm": 1.83554173576104, "learning_rate": 3.515477967421637e-05, "loss": 0.525, "step": 9250 }, { "epoch": 0.898505724820493, "grad_norm": 1.756580666977339, "learning_rate": 3.5138555389707316e-05, "loss": 0.5297, "step": 9260 }, { "epoch": 0.8994760333786144, "grad_norm": 1.3709870685247603, "learning_rate": 3.5122331105198265e-05, "loss": 0.5529, "step": 9270 }, { "epoch": 0.9004463419367359, "grad_norm": 1.910129116074302, "learning_rate": 3.510610682068921e-05, "loss": 0.5575, "step": 9280 }, { "epoch": 0.9014166504948574, "grad_norm": 1.457636418218358, "learning_rate": 3.508988253618016e-05, "loss": 0.6219, "step": 9290 }, { "epoch": 0.9023869590529788, "grad_norm": 1.7400018831329018, "learning_rate": 3.50736582516711e-05, "loss": 0.597, "step": 9300 }, { "epoch": 0.9033572676111004, "grad_norm": 1.7711972448898297, "learning_rate": 3.505743396716205e-05, "loss": 0.5953, "step": 9310 }, { "epoch": 0.9043275761692218, "grad_norm": 1.3127391028092956, "learning_rate": 3.5041209682653e-05, "loss": 0.5771, "step": 9320 }, { "epoch": 0.9052978847273433, "grad_norm": 1.707083390377331, "learning_rate": 3.5024985398143944e-05, "loss": 0.5941, "step": 9330 }, { "epoch": 0.9062681932854648, "grad_norm": 2.3395444029475425, "learning_rate": 3.5008761113634894e-05, "loss": 0.5249, "step": 9340 }, { "epoch": 0.9072385018435862, "grad_norm": 2.15851320522422, "learning_rate": 3.4992536829125837e-05, "loss": 0.5754, "step": 9350 }, { "epoch": 0.9082088104017078, "grad_norm": 1.566455099295683, "learning_rate": 3.4976312544616786e-05, "loss": 0.5982, "step": 9360 }, { "epoch": 0.9091791189598293, "grad_norm": 1.6447262177778976, "learning_rate": 3.496008826010773e-05, "loss": 0.5819, "step": 9370 }, { "epoch": 0.9101494275179507, "grad_norm": 1.9281752331049982, "learning_rate": 3.494386397559868e-05, "loss": 0.5789, "step": 9380 }, { "epoch": 0.9111197360760722, "grad_norm": 1.9237214019216216, "learning_rate": 3.492763969108962e-05, "loss": 0.5773, "step": 9390 }, { "epoch": 0.9120900446341936, "grad_norm": 1.5099620370471458, "learning_rate": 3.491141540658057e-05, "loss": 0.5954, "step": 9400 }, { "epoch": 0.9130603531923152, "grad_norm": 1.861214942110368, "learning_rate": 3.489519112207152e-05, "loss": 0.5956, "step": 9410 }, { "epoch": 0.9140306617504367, "grad_norm": 1.780788157697634, "learning_rate": 3.4878966837562465e-05, "loss": 0.5492, "step": 9420 }, { "epoch": 0.9150009703085581, "grad_norm": 1.7391155816623414, "learning_rate": 3.4862742553053414e-05, "loss": 0.5984, "step": 9430 }, { "epoch": 0.9159712788666796, "grad_norm": 2.1229665459752125, "learning_rate": 3.484651826854436e-05, "loss": 0.5924, "step": 9440 }, { "epoch": 0.9169415874248011, "grad_norm": 2.1306613515387625, "learning_rate": 3.483029398403531e-05, "loss": 0.5148, "step": 9450 }, { "epoch": 0.9179118959829226, "grad_norm": 1.312566076533743, "learning_rate": 3.481406969952625e-05, "loss": 0.5919, "step": 9460 }, { "epoch": 0.9188822045410441, "grad_norm": 1.4886826509371758, "learning_rate": 3.47978454150172e-05, "loss": 0.5853, "step": 9470 }, { "epoch": 0.9198525130991655, "grad_norm": 2.0143652338550098, "learning_rate": 3.478162113050815e-05, "loss": 0.5689, "step": 9480 }, { "epoch": 0.920822821657287, "grad_norm": 1.1368414203141723, "learning_rate": 3.476539684599909e-05, "loss": 0.5546, "step": 9490 }, { "epoch": 0.9217931302154085, "grad_norm": 1.6341362563091877, "learning_rate": 3.474917256149004e-05, "loss": 0.5481, "step": 9500 }, { "epoch": 0.9217931302154085, "eval_loss": 0.6622401475906372, "eval_runtime": 2474.1087, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 9500 }, { "epoch": 0.92276343877353, "grad_norm": 1.8501035490125046, "learning_rate": 3.4732948276980985e-05, "loss": 0.519, "step": 9510 }, { "epoch": 0.9237337473316515, "grad_norm": 1.788403250607821, "learning_rate": 3.4716723992471935e-05, "loss": 0.5934, "step": 9520 }, { "epoch": 0.924704055889773, "grad_norm": 2.0089841746536212, "learning_rate": 3.470049970796288e-05, "loss": 0.5463, "step": 9530 }, { "epoch": 0.9256743644478944, "grad_norm": 1.4797021959834935, "learning_rate": 3.468427542345383e-05, "loss": 0.538, "step": 9540 }, { "epoch": 0.9266446730060159, "grad_norm": 1.5018994978955122, "learning_rate": 3.466805113894477e-05, "loss": 0.5859, "step": 9550 }, { "epoch": 0.9276149815641374, "grad_norm": 1.6445575492561615, "learning_rate": 3.465182685443572e-05, "loss": 0.5335, "step": 9560 }, { "epoch": 0.9285852901222589, "grad_norm": 1.910218513281919, "learning_rate": 3.463560256992667e-05, "loss": 0.5284, "step": 9570 }, { "epoch": 0.9295555986803804, "grad_norm": 1.5061058308088753, "learning_rate": 3.4619378285417614e-05, "loss": 0.5332, "step": 9580 }, { "epoch": 0.9305259072385018, "grad_norm": 1.5869569364806828, "learning_rate": 3.460315400090856e-05, "loss": 0.5591, "step": 9590 }, { "epoch": 0.9314962157966233, "grad_norm": 1.709001551959916, "learning_rate": 3.4586929716399506e-05, "loss": 0.5499, "step": 9600 }, { "epoch": 0.9324665243547449, "grad_norm": 1.5648665735772118, "learning_rate": 3.4570705431890456e-05, "loss": 0.5677, "step": 9610 }, { "epoch": 0.9334368329128663, "grad_norm": 2.297106138114182, "learning_rate": 3.45544811473814e-05, "loss": 0.5687, "step": 9620 }, { "epoch": 0.9344071414709878, "grad_norm": 1.940344115414216, "learning_rate": 3.453825686287235e-05, "loss": 0.5614, "step": 9630 }, { "epoch": 0.9353774500291092, "grad_norm": 2.2931746226047336, "learning_rate": 3.452203257836329e-05, "loss": 0.5888, "step": 9640 }, { "epoch": 0.9363477585872307, "grad_norm": 1.6726396768074983, "learning_rate": 3.450580829385424e-05, "loss": 0.555, "step": 9650 }, { "epoch": 0.9373180671453523, "grad_norm": 1.5245738077800575, "learning_rate": 3.448958400934519e-05, "loss": 0.5464, "step": 9660 }, { "epoch": 0.9382883757034737, "grad_norm": 1.3863585051832457, "learning_rate": 3.4473359724836134e-05, "loss": 0.5033, "step": 9670 }, { "epoch": 0.9392586842615952, "grad_norm": 1.2261387961871664, "learning_rate": 3.4457135440327084e-05, "loss": 0.5229, "step": 9680 }, { "epoch": 0.9402289928197167, "grad_norm": 1.8933274253957586, "learning_rate": 3.444091115581803e-05, "loss": 0.552, "step": 9690 }, { "epoch": 0.9411993013778381, "grad_norm": 1.7235797326078635, "learning_rate": 3.442468687130898e-05, "loss": 0.5899, "step": 9700 }, { "epoch": 0.9421696099359597, "grad_norm": 1.428965938918239, "learning_rate": 3.440846258679992e-05, "loss": 0.5733, "step": 9710 }, { "epoch": 0.9431399184940811, "grad_norm": 1.4190853376920558, "learning_rate": 3.439223830229087e-05, "loss": 0.5762, "step": 9720 }, { "epoch": 0.9441102270522026, "grad_norm": 1.569528489090731, "learning_rate": 3.437601401778181e-05, "loss": 0.5535, "step": 9730 }, { "epoch": 0.9450805356103241, "grad_norm": 1.869896863596011, "learning_rate": 3.435978973327276e-05, "loss": 0.5651, "step": 9740 }, { "epoch": 0.9460508441684455, "grad_norm": 2.3801435462427785, "learning_rate": 3.434356544876371e-05, "loss": 0.5366, "step": 9750 }, { "epoch": 0.9470211527265671, "grad_norm": 1.7543924621581104, "learning_rate": 3.4327341164254655e-05, "loss": 0.5407, "step": 9760 }, { "epoch": 0.9479914612846886, "grad_norm": 1.6645160945117223, "learning_rate": 3.4311116879745605e-05, "loss": 0.5776, "step": 9770 }, { "epoch": 0.94896176984281, "grad_norm": 1.5226969850196896, "learning_rate": 3.429489259523655e-05, "loss": 0.5189, "step": 9780 }, { "epoch": 0.9499320784009315, "grad_norm": 1.7957341969322531, "learning_rate": 3.42786683107275e-05, "loss": 0.5771, "step": 9790 }, { "epoch": 0.9509023869590529, "grad_norm": 1.4922624955935235, "learning_rate": 3.426244402621844e-05, "loss": 0.5339, "step": 9800 }, { "epoch": 0.9518726955171745, "grad_norm": 1.9179842292383775, "learning_rate": 3.424621974170939e-05, "loss": 0.5408, "step": 9810 }, { "epoch": 0.952843004075296, "grad_norm": 1.8930907076270356, "learning_rate": 3.4229995457200333e-05, "loss": 0.5622, "step": 9820 }, { "epoch": 0.9538133126334174, "grad_norm": 1.781854001943581, "learning_rate": 3.421377117269128e-05, "loss": 0.5654, "step": 9830 }, { "epoch": 0.9547836211915389, "grad_norm": 1.9058249916201926, "learning_rate": 3.419754688818223e-05, "loss": 0.6089, "step": 9840 }, { "epoch": 0.9557539297496604, "grad_norm": 1.5813398061915347, "learning_rate": 3.4181322603673176e-05, "loss": 0.5444, "step": 9850 }, { "epoch": 0.9567242383077819, "grad_norm": 1.6480219585268394, "learning_rate": 3.4165098319164126e-05, "loss": 0.5215, "step": 9860 }, { "epoch": 0.9576945468659034, "grad_norm": 1.6302754198626406, "learning_rate": 3.414887403465507e-05, "loss": 0.5225, "step": 9870 }, { "epoch": 0.9586648554240248, "grad_norm": 1.9428020773502297, "learning_rate": 3.4132649750146025e-05, "loss": 0.5227, "step": 9880 }, { "epoch": 0.9596351639821463, "grad_norm": 2.1394044994306376, "learning_rate": 3.411642546563697e-05, "loss": 0.5227, "step": 9890 }, { "epoch": 0.9606054725402678, "grad_norm": 1.4958041091695313, "learning_rate": 3.410020118112792e-05, "loss": 0.5536, "step": 9900 }, { "epoch": 0.9615757810983893, "grad_norm": 1.65783511931855, "learning_rate": 3.408397689661886e-05, "loss": 0.6116, "step": 9910 }, { "epoch": 0.9625460896565108, "grad_norm": 1.2687727005359897, "learning_rate": 3.406775261210981e-05, "loss": 0.582, "step": 9920 }, { "epoch": 0.9635163982146323, "grad_norm": 1.642309856430725, "learning_rate": 3.4051528327600754e-05, "loss": 0.5156, "step": 9930 }, { "epoch": 0.9644867067727537, "grad_norm": 1.5540151182331825, "learning_rate": 3.4035304043091704e-05, "loss": 0.514, "step": 9940 }, { "epoch": 0.9654570153308752, "grad_norm": 1.6334411063744383, "learning_rate": 3.401907975858265e-05, "loss": 0.585, "step": 9950 }, { "epoch": 0.9664273238889967, "grad_norm": 1.5262322683208274, "learning_rate": 3.4002855474073596e-05, "loss": 0.5493, "step": 9960 }, { "epoch": 0.9673976324471182, "grad_norm": 2.041216469634578, "learning_rate": 3.3986631189564546e-05, "loss": 0.5701, "step": 9970 }, { "epoch": 0.9683679410052397, "grad_norm": 1.6826626866998198, "learning_rate": 3.397040690505549e-05, "loss": 0.5522, "step": 9980 }, { "epoch": 0.9693382495633611, "grad_norm": 1.3784779820091337, "learning_rate": 3.395418262054644e-05, "loss": 0.5401, "step": 9990 }, { "epoch": 0.9703085581214826, "grad_norm": 2.164538127382688, "learning_rate": 3.393795833603738e-05, "loss": 0.5493, "step": 10000 }, { "epoch": 0.9703085581214826, "eval_loss": 0.6604536771774292, "eval_runtime": 2471.8323, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 10000 }, { "epoch": 0.9712788666796042, "grad_norm": 1.641886541364288, "learning_rate": 3.392173405152833e-05, "loss": 0.5592, "step": 10010 }, { "epoch": 0.9722491752377256, "grad_norm": 1.676600577801569, "learning_rate": 3.3905509767019275e-05, "loss": 0.5675, "step": 10020 }, { "epoch": 0.9732194837958471, "grad_norm": 1.575522175535914, "learning_rate": 3.3889285482510224e-05, "loss": 0.5564, "step": 10030 }, { "epoch": 0.9741897923539685, "grad_norm": 1.4256611852011571, "learning_rate": 3.3873061198001174e-05, "loss": 0.5654, "step": 10040 }, { "epoch": 0.97516010091209, "grad_norm": 1.9766555201121603, "learning_rate": 3.385683691349212e-05, "loss": 0.5853, "step": 10050 }, { "epoch": 0.9761304094702116, "grad_norm": 1.4257493175991105, "learning_rate": 3.384061262898307e-05, "loss": 0.602, "step": 10060 }, { "epoch": 0.977100718028333, "grad_norm": 1.7942732122646, "learning_rate": 3.382438834447401e-05, "loss": 0.5083, "step": 10070 }, { "epoch": 0.9780710265864545, "grad_norm": 1.970290799031593, "learning_rate": 3.380816405996496e-05, "loss": 0.5353, "step": 10080 }, { "epoch": 0.979041335144576, "grad_norm": 2.177029103778447, "learning_rate": 3.37919397754559e-05, "loss": 0.5364, "step": 10090 }, { "epoch": 0.9800116437026974, "grad_norm": 1.6319237609838204, "learning_rate": 3.377571549094685e-05, "loss": 0.5498, "step": 10100 }, { "epoch": 0.980981952260819, "grad_norm": 1.9418003047270103, "learning_rate": 3.3759491206437795e-05, "loss": 0.6225, "step": 10110 }, { "epoch": 0.9819522608189404, "grad_norm": 1.5233382574211756, "learning_rate": 3.3743266921928745e-05, "loss": 0.5488, "step": 10120 }, { "epoch": 0.9829225693770619, "grad_norm": 1.7034150218696569, "learning_rate": 3.3727042637419695e-05, "loss": 0.5912, "step": 10130 }, { "epoch": 0.9838928779351834, "grad_norm": 1.9808892841805323, "learning_rate": 3.371081835291064e-05, "loss": 0.5657, "step": 10140 }, { "epoch": 0.9848631864933048, "grad_norm": 1.9429830219605533, "learning_rate": 3.369459406840159e-05, "loss": 0.563, "step": 10150 }, { "epoch": 0.9858334950514264, "grad_norm": 1.7823608925927663, "learning_rate": 3.367836978389253e-05, "loss": 0.5315, "step": 10160 }, { "epoch": 0.9868038036095479, "grad_norm": 1.6317868773777158, "learning_rate": 3.366214549938348e-05, "loss": 0.5663, "step": 10170 }, { "epoch": 0.9877741121676693, "grad_norm": 1.9950884655573202, "learning_rate": 3.3645921214874424e-05, "loss": 0.5704, "step": 10180 }, { "epoch": 0.9887444207257908, "grad_norm": 1.6320762630806733, "learning_rate": 3.362969693036537e-05, "loss": 0.5746, "step": 10190 }, { "epoch": 0.9897147292839122, "grad_norm": 1.685041060441873, "learning_rate": 3.361347264585632e-05, "loss": 0.5917, "step": 10200 }, { "epoch": 0.9906850378420338, "grad_norm": 1.7530117501023248, "learning_rate": 3.3597248361347266e-05, "loss": 0.5365, "step": 10210 }, { "epoch": 0.9916553464001553, "grad_norm": 1.9196791146989973, "learning_rate": 3.3581024076838216e-05, "loss": 0.5795, "step": 10220 }, { "epoch": 0.9926256549582767, "grad_norm": 1.9870737501998446, "learning_rate": 3.356479979232916e-05, "loss": 0.6544, "step": 10230 }, { "epoch": 0.9935959635163982, "grad_norm": 1.491102870770748, "learning_rate": 3.354857550782011e-05, "loss": 0.5083, "step": 10240 }, { "epoch": 0.9945662720745198, "grad_norm": 1.5900809359608934, "learning_rate": 3.353235122331105e-05, "loss": 0.5693, "step": 10250 }, { "epoch": 0.9955365806326412, "grad_norm": 1.6635095304395011, "learning_rate": 3.3516126938802e-05, "loss": 0.5744, "step": 10260 }, { "epoch": 0.9965068891907627, "grad_norm": 2.0049230325912957, "learning_rate": 3.3499902654292944e-05, "loss": 0.5461, "step": 10270 }, { "epoch": 0.9974771977488841, "grad_norm": 1.50147581851131, "learning_rate": 3.3483678369783894e-05, "loss": 0.5803, "step": 10280 }, { "epoch": 0.9984475063070056, "grad_norm": 1.8064338359868768, "learning_rate": 3.3467454085274844e-05, "loss": 0.506, "step": 10290 }, { "epoch": 0.9994178148651272, "grad_norm": 2.052231872791701, "learning_rate": 3.345122980076579e-05, "loss": 0.5752, "step": 10300 }, { "epoch": 1.0003881234232486, "grad_norm": 1.7418958102493116, "learning_rate": 3.343500551625674e-05, "loss": 0.6082, "step": 10310 }, { "epoch": 1.00135843198137, "grad_norm": 1.8975860607542987, "learning_rate": 3.341878123174768e-05, "loss": 0.5524, "step": 10320 }, { "epoch": 1.0023287405394916, "grad_norm": 1.7076807320811012, "learning_rate": 3.340255694723863e-05, "loss": 0.6075, "step": 10330 }, { "epoch": 1.003299049097613, "grad_norm": 1.4300451205657956, "learning_rate": 3.338633266272957e-05, "loss": 0.5326, "step": 10340 }, { "epoch": 1.0042693576557344, "grad_norm": 2.0682797223020777, "learning_rate": 3.337010837822052e-05, "loss": 0.5484, "step": 10350 }, { "epoch": 1.005239666213856, "grad_norm": 1.536692096590345, "learning_rate": 3.3353884093711465e-05, "loss": 0.546, "step": 10360 }, { "epoch": 1.0062099747719775, "grad_norm": 1.7848861749442593, "learning_rate": 3.3337659809202415e-05, "loss": 0.5763, "step": 10370 }, { "epoch": 1.007180283330099, "grad_norm": 1.4925347515246201, "learning_rate": 3.3321435524693365e-05, "loss": 0.5005, "step": 10380 }, { "epoch": 1.0081505918882205, "grad_norm": 1.4245048085109102, "learning_rate": 3.330521124018431e-05, "loss": 0.5352, "step": 10390 }, { "epoch": 1.009120900446342, "grad_norm": 1.6528542775713155, "learning_rate": 3.328898695567526e-05, "loss": 0.5794, "step": 10400 }, { "epoch": 1.0100912090044634, "grad_norm": 1.7391134864648952, "learning_rate": 3.32727626711662e-05, "loss": 0.6133, "step": 10410 }, { "epoch": 1.011061517562585, "grad_norm": 1.8040782879083466, "learning_rate": 3.325653838665715e-05, "loss": 0.5124, "step": 10420 }, { "epoch": 1.0120318261207064, "grad_norm": 1.730132568689756, "learning_rate": 3.324031410214809e-05, "loss": 0.4803, "step": 10430 }, { "epoch": 1.0130021346788278, "grad_norm": 1.8540575748734034, "learning_rate": 3.322408981763904e-05, "loss": 0.5188, "step": 10440 }, { "epoch": 1.0139724432369492, "grad_norm": 1.7683659307789739, "learning_rate": 3.3207865533129986e-05, "loss": 0.5341, "step": 10450 }, { "epoch": 1.0149427517950709, "grad_norm": 1.7745239019667731, "learning_rate": 3.3191641248620936e-05, "loss": 0.5858, "step": 10460 }, { "epoch": 1.0159130603531923, "grad_norm": 1.5664467690196044, "learning_rate": 3.3175416964111886e-05, "loss": 0.6587, "step": 10470 }, { "epoch": 1.0168833689113137, "grad_norm": 1.92833985335436, "learning_rate": 3.315919267960283e-05, "loss": 0.5532, "step": 10480 }, { "epoch": 1.0178536774694353, "grad_norm": 1.646924763771934, "learning_rate": 3.314296839509378e-05, "loss": 0.5638, "step": 10490 }, { "epoch": 1.0188239860275567, "grad_norm": 1.477844051399051, "learning_rate": 3.312674411058472e-05, "loss": 0.5254, "step": 10500 }, { "epoch": 1.0188239860275567, "eval_loss": 0.6569487452507019, "eval_runtime": 2472.3859, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 10500 }, { "epoch": 1.0197942945856782, "grad_norm": 1.7732534556121307, "learning_rate": 3.311051982607567e-05, "loss": 0.5098, "step": 10510 }, { "epoch": 1.0207646031437998, "grad_norm": 1.8592910812190746, "learning_rate": 3.3094295541566614e-05, "loss": 0.5836, "step": 10520 }, { "epoch": 1.0217349117019212, "grad_norm": 1.7186074858012894, "learning_rate": 3.3078071257057564e-05, "loss": 0.5445, "step": 10530 }, { "epoch": 1.0227052202600426, "grad_norm": 2.043653022917622, "learning_rate": 3.306184697254851e-05, "loss": 0.5627, "step": 10540 }, { "epoch": 1.0236755288181643, "grad_norm": 1.7054473164163695, "learning_rate": 3.304562268803946e-05, "loss": 0.6426, "step": 10550 }, { "epoch": 1.0246458373762857, "grad_norm": 1.5920397197878904, "learning_rate": 3.3029398403530406e-05, "loss": 0.5542, "step": 10560 }, { "epoch": 1.025616145934407, "grad_norm": 1.6885876369635333, "learning_rate": 3.301317411902135e-05, "loss": 0.5399, "step": 10570 }, { "epoch": 1.0265864544925287, "grad_norm": 1.712761359550596, "learning_rate": 3.29969498345123e-05, "loss": 0.5609, "step": 10580 }, { "epoch": 1.0275567630506501, "grad_norm": 1.849118205679511, "learning_rate": 3.298072555000324e-05, "loss": 0.5817, "step": 10590 }, { "epoch": 1.0285270716087715, "grad_norm": 1.7919320749353798, "learning_rate": 3.296450126549419e-05, "loss": 0.562, "step": 10600 }, { "epoch": 1.0294973801668932, "grad_norm": 1.9867833035377922, "learning_rate": 3.2948276980985135e-05, "loss": 0.5266, "step": 10610 }, { "epoch": 1.0304676887250146, "grad_norm": 1.4844422926334193, "learning_rate": 3.2932052696476085e-05, "loss": 0.5989, "step": 10620 }, { "epoch": 1.031437997283136, "grad_norm": 1.902397349373601, "learning_rate": 3.2915828411967034e-05, "loss": 0.4869, "step": 10630 }, { "epoch": 1.0324083058412574, "grad_norm": 1.595731562090853, "learning_rate": 3.289960412745798e-05, "loss": 0.5014, "step": 10640 }, { "epoch": 1.033378614399379, "grad_norm": 1.641533553754366, "learning_rate": 3.288337984294893e-05, "loss": 0.5847, "step": 10650 }, { "epoch": 1.0343489229575005, "grad_norm": 1.6793350674365874, "learning_rate": 3.286715555843988e-05, "loss": 0.5433, "step": 10660 }, { "epoch": 1.0353192315156219, "grad_norm": 1.7886777094595252, "learning_rate": 3.285093127393083e-05, "loss": 0.5936, "step": 10670 }, { "epoch": 1.0362895400737435, "grad_norm": 1.7628097025479255, "learning_rate": 3.283470698942177e-05, "loss": 0.5176, "step": 10680 }, { "epoch": 1.037259848631865, "grad_norm": 1.7117894064748884, "learning_rate": 3.281848270491272e-05, "loss": 0.5166, "step": 10690 }, { "epoch": 1.0382301571899863, "grad_norm": 1.8595449574787446, "learning_rate": 3.280225842040366e-05, "loss": 0.5511, "step": 10700 }, { "epoch": 1.039200465748108, "grad_norm": 1.5833347089731324, "learning_rate": 3.278603413589461e-05, "loss": 0.5808, "step": 10710 }, { "epoch": 1.0401707743062294, "grad_norm": 1.5168916043350162, "learning_rate": 3.2769809851385555e-05, "loss": 0.5295, "step": 10720 }, { "epoch": 1.0411410828643508, "grad_norm": 1.9583788202686643, "learning_rate": 3.2753585566876505e-05, "loss": 0.5859, "step": 10730 }, { "epoch": 1.0421113914224724, "grad_norm": 1.6490551146953607, "learning_rate": 3.273736128236745e-05, "loss": 0.5988, "step": 10740 }, { "epoch": 1.0430816999805939, "grad_norm": 2.2945646534561734, "learning_rate": 3.27211369978584e-05, "loss": 0.5481, "step": 10750 }, { "epoch": 1.0440520085387153, "grad_norm": 1.6823894633457166, "learning_rate": 3.270491271334935e-05, "loss": 0.566, "step": 10760 }, { "epoch": 1.0450223170968367, "grad_norm": 1.9698263256075523, "learning_rate": 3.268868842884029e-05, "loss": 0.582, "step": 10770 }, { "epoch": 1.0459926256549583, "grad_norm": 1.6140673075775909, "learning_rate": 3.267246414433124e-05, "loss": 0.4717, "step": 10780 }, { "epoch": 1.0469629342130797, "grad_norm": 1.916378749987929, "learning_rate": 3.265623985982218e-05, "loss": 0.5244, "step": 10790 }, { "epoch": 1.0479332427712011, "grad_norm": 1.9482061690949193, "learning_rate": 3.264001557531313e-05, "loss": 0.5973, "step": 10800 }, { "epoch": 1.0489035513293228, "grad_norm": 2.142491022649256, "learning_rate": 3.2623791290804076e-05, "loss": 0.5317, "step": 10810 }, { "epoch": 1.0498738598874442, "grad_norm": 1.706373160041174, "learning_rate": 3.2607567006295026e-05, "loss": 0.5136, "step": 10820 }, { "epoch": 1.0508441684455656, "grad_norm": 1.8959475848770957, "learning_rate": 3.2591342721785976e-05, "loss": 0.5655, "step": 10830 }, { "epoch": 1.0518144770036872, "grad_norm": 1.8921424824926663, "learning_rate": 3.257511843727692e-05, "loss": 0.5164, "step": 10840 }, { "epoch": 1.0527847855618087, "grad_norm": 2.0234555341020664, "learning_rate": 3.255889415276787e-05, "loss": 0.5335, "step": 10850 }, { "epoch": 1.05375509411993, "grad_norm": 2.0614821240519947, "learning_rate": 3.254266986825881e-05, "loss": 0.5022, "step": 10860 }, { "epoch": 1.0547254026780517, "grad_norm": 1.7142183353166638, "learning_rate": 3.252644558374976e-05, "loss": 0.5452, "step": 10870 }, { "epoch": 1.0556957112361731, "grad_norm": 1.8265079239939517, "learning_rate": 3.2510221299240704e-05, "loss": 0.5302, "step": 10880 }, { "epoch": 1.0566660197942945, "grad_norm": 1.7953764101996608, "learning_rate": 3.2493997014731654e-05, "loss": 0.5418, "step": 10890 }, { "epoch": 1.0576363283524162, "grad_norm": 1.8762589557600082, "learning_rate": 3.24777727302226e-05, "loss": 0.5743, "step": 10900 }, { "epoch": 1.0586066369105376, "grad_norm": 1.4926319666858994, "learning_rate": 3.246154844571355e-05, "loss": 0.5291, "step": 10910 }, { "epoch": 1.059576945468659, "grad_norm": 1.9278698099129787, "learning_rate": 3.2445324161204497e-05, "loss": 0.5409, "step": 10920 }, { "epoch": 1.0605472540267806, "grad_norm": 1.8973044877439684, "learning_rate": 3.242909987669544e-05, "loss": 0.5466, "step": 10930 }, { "epoch": 1.061517562584902, "grad_norm": 2.174563710599117, "learning_rate": 3.241287559218639e-05, "loss": 0.516, "step": 10940 }, { "epoch": 1.0624878711430235, "grad_norm": 2.1559495316330786, "learning_rate": 3.239665130767733e-05, "loss": 0.5951, "step": 10950 }, { "epoch": 1.0634581797011449, "grad_norm": 2.224958308598168, "learning_rate": 3.238042702316828e-05, "loss": 0.5216, "step": 10960 }, { "epoch": 1.0644284882592665, "grad_norm": 1.5388616251824314, "learning_rate": 3.2364202738659225e-05, "loss": 0.551, "step": 10970 }, { "epoch": 1.065398796817388, "grad_norm": 1.8563910144302744, "learning_rate": 3.2347978454150175e-05, "loss": 0.5325, "step": 10980 }, { "epoch": 1.0663691053755093, "grad_norm": 1.249386464606806, "learning_rate": 3.233175416964112e-05, "loss": 0.5223, "step": 10990 }, { "epoch": 1.067339413933631, "grad_norm": 1.7788595735717652, "learning_rate": 3.231552988513207e-05, "loss": 0.5353, "step": 11000 }, { "epoch": 1.067339413933631, "eval_loss": 0.6541542410850525, "eval_runtime": 2467.7272, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 11000 }, { "epoch": 1.0683097224917524, "grad_norm": 2.093397486341294, "learning_rate": 3.229930560062302e-05, "loss": 0.5373, "step": 11010 }, { "epoch": 1.0692800310498738, "grad_norm": 1.6292327190910305, "learning_rate": 3.228308131611396e-05, "loss": 0.5417, "step": 11020 }, { "epoch": 1.0702503396079954, "grad_norm": 1.6545192933610855, "learning_rate": 3.226685703160491e-05, "loss": 0.5508, "step": 11030 }, { "epoch": 1.0712206481661168, "grad_norm": 1.8906889663679678, "learning_rate": 3.225063274709585e-05, "loss": 0.5268, "step": 11040 }, { "epoch": 1.0721909567242383, "grad_norm": 1.392471317144995, "learning_rate": 3.22344084625868e-05, "loss": 0.5226, "step": 11050 }, { "epoch": 1.0731612652823599, "grad_norm": 1.9171337047147172, "learning_rate": 3.2218184178077746e-05, "loss": 0.5212, "step": 11060 }, { "epoch": 1.0741315738404813, "grad_norm": 1.4857874915171814, "learning_rate": 3.2201959893568696e-05, "loss": 0.5335, "step": 11070 }, { "epoch": 1.0751018823986027, "grad_norm": 1.7538116790409093, "learning_rate": 3.218573560905964e-05, "loss": 0.542, "step": 11080 }, { "epoch": 1.0760721909567241, "grad_norm": 1.6310285074576256, "learning_rate": 3.216951132455059e-05, "loss": 0.5114, "step": 11090 }, { "epoch": 1.0770424995148458, "grad_norm": 1.6921457584921478, "learning_rate": 3.215328704004154e-05, "loss": 0.4924, "step": 11100 }, { "epoch": 1.0780128080729672, "grad_norm": 1.7745242342830565, "learning_rate": 3.213706275553248e-05, "loss": 0.5567, "step": 11110 }, { "epoch": 1.0789831166310886, "grad_norm": 1.69496703066604, "learning_rate": 3.212083847102343e-05, "loss": 0.5237, "step": 11120 }, { "epoch": 1.0799534251892102, "grad_norm": 1.7029551999976154, "learning_rate": 3.2104614186514374e-05, "loss": 0.5023, "step": 11130 }, { "epoch": 1.0809237337473316, "grad_norm": 1.71464384875071, "learning_rate": 3.2088389902005324e-05, "loss": 0.5111, "step": 11140 }, { "epoch": 1.081894042305453, "grad_norm": 1.745797205638, "learning_rate": 3.207216561749627e-05, "loss": 0.5259, "step": 11150 }, { "epoch": 1.0828643508635747, "grad_norm": 1.8756703389755518, "learning_rate": 3.2055941332987216e-05, "loss": 0.5135, "step": 11160 }, { "epoch": 1.083834659421696, "grad_norm": 1.7773801380695633, "learning_rate": 3.203971704847816e-05, "loss": 0.606, "step": 11170 }, { "epoch": 1.0848049679798175, "grad_norm": 1.9149914262933831, "learning_rate": 3.202349276396911e-05, "loss": 0.504, "step": 11180 }, { "epoch": 1.0857752765379391, "grad_norm": 1.5792891192161234, "learning_rate": 3.200726847946006e-05, "loss": 0.4899, "step": 11190 }, { "epoch": 1.0867455850960606, "grad_norm": 1.3849113367423667, "learning_rate": 3.1991044194951e-05, "loss": 0.5435, "step": 11200 }, { "epoch": 1.087715893654182, "grad_norm": 1.836894182259094, "learning_rate": 3.197481991044195e-05, "loss": 0.5417, "step": 11210 }, { "epoch": 1.0886862022123036, "grad_norm": 2.5716819476436723, "learning_rate": 3.1958595625932895e-05, "loss": 0.5487, "step": 11220 }, { "epoch": 1.089656510770425, "grad_norm": 1.7054225586630418, "learning_rate": 3.1942371341423845e-05, "loss": 0.5366, "step": 11230 }, { "epoch": 1.0906268193285464, "grad_norm": 1.9132021835433188, "learning_rate": 3.192614705691479e-05, "loss": 0.5212, "step": 11240 }, { "epoch": 1.091597127886668, "grad_norm": 1.5687725973259348, "learning_rate": 3.190992277240574e-05, "loss": 0.5013, "step": 11250 }, { "epoch": 1.0925674364447895, "grad_norm": 1.7910741705827617, "learning_rate": 3.189369848789669e-05, "loss": 0.5359, "step": 11260 }, { "epoch": 1.093537745002911, "grad_norm": 1.4238221915445326, "learning_rate": 3.187747420338763e-05, "loss": 0.5729, "step": 11270 }, { "epoch": 1.0945080535610323, "grad_norm": 1.8958035882900321, "learning_rate": 3.186124991887858e-05, "loss": 0.5349, "step": 11280 }, { "epoch": 1.095478362119154, "grad_norm": 1.5644842460614365, "learning_rate": 3.184502563436952e-05, "loss": 0.5275, "step": 11290 }, { "epoch": 1.0964486706772754, "grad_norm": 1.6966656078568068, "learning_rate": 3.182880134986047e-05, "loss": 0.5714, "step": 11300 }, { "epoch": 1.0974189792353968, "grad_norm": 1.5845176878742038, "learning_rate": 3.1812577065351416e-05, "loss": 0.547, "step": 11310 }, { "epoch": 1.0983892877935184, "grad_norm": 1.9638352416110092, "learning_rate": 3.1796352780842365e-05, "loss": 0.5371, "step": 11320 }, { "epoch": 1.0993595963516398, "grad_norm": 1.3333867282862815, "learning_rate": 3.178012849633331e-05, "loss": 0.5558, "step": 11330 }, { "epoch": 1.1003299049097612, "grad_norm": 2.028988375070847, "learning_rate": 3.176390421182426e-05, "loss": 0.5425, "step": 11340 }, { "epoch": 1.1013002134678829, "grad_norm": 2.0233727263417745, "learning_rate": 3.174767992731521e-05, "loss": 0.5128, "step": 11350 }, { "epoch": 1.1022705220260043, "grad_norm": 1.6415117808780524, "learning_rate": 3.173145564280615e-05, "loss": 0.476, "step": 11360 }, { "epoch": 1.1032408305841257, "grad_norm": 2.0488869521619972, "learning_rate": 3.17152313582971e-05, "loss": 0.5221, "step": 11370 }, { "epoch": 1.1042111391422473, "grad_norm": 2.11079167436828, "learning_rate": 3.1699007073788044e-05, "loss": 0.5566, "step": 11380 }, { "epoch": 1.1051814477003687, "grad_norm": 2.0421176306313398, "learning_rate": 3.1682782789278993e-05, "loss": 0.5061, "step": 11390 }, { "epoch": 1.1061517562584902, "grad_norm": 1.9992500086060474, "learning_rate": 3.1666558504769936e-05, "loss": 0.542, "step": 11400 }, { "epoch": 1.1071220648166116, "grad_norm": 1.872752246092324, "learning_rate": 3.1650334220260886e-05, "loss": 0.5602, "step": 11410 }, { "epoch": 1.1080923733747332, "grad_norm": 2.0723616338374646, "learning_rate": 3.163410993575183e-05, "loss": 0.5052, "step": 11420 }, { "epoch": 1.1090626819328546, "grad_norm": 1.7543258598505558, "learning_rate": 3.161788565124278e-05, "loss": 0.5951, "step": 11430 }, { "epoch": 1.110032990490976, "grad_norm": 1.741111503275317, "learning_rate": 3.160166136673373e-05, "loss": 0.5481, "step": 11440 }, { "epoch": 1.1110032990490977, "grad_norm": 1.3801479018015765, "learning_rate": 3.158543708222468e-05, "loss": 0.5104, "step": 11450 }, { "epoch": 1.111973607607219, "grad_norm": 1.9442170634773426, "learning_rate": 3.156921279771563e-05, "loss": 0.5129, "step": 11460 }, { "epoch": 1.1129439161653405, "grad_norm": 1.9331832081031561, "learning_rate": 3.155298851320657e-05, "loss": 0.5139, "step": 11470 }, { "epoch": 1.1139142247234621, "grad_norm": 1.5810959938815903, "learning_rate": 3.153676422869752e-05, "loss": 0.603, "step": 11480 }, { "epoch": 1.1148845332815835, "grad_norm": 1.7478463817804297, "learning_rate": 3.1520539944188464e-05, "loss": 0.5573, "step": 11490 }, { "epoch": 1.115854841839705, "grad_norm": 1.512993291404137, "learning_rate": 3.1504315659679414e-05, "loss": 0.4983, "step": 11500 }, { "epoch": 1.115854841839705, "eval_loss": 0.6542506814002991, "eval_runtime": 2471.8207, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 11500 }, { "epoch": 1.1168251503978266, "grad_norm": 1.9794498366078679, "learning_rate": 3.148809137517036e-05, "loss": 0.5397, "step": 11510 }, { "epoch": 1.117795458955948, "grad_norm": 1.6989190060539383, "learning_rate": 3.1471867090661307e-05, "loss": 0.4416, "step": 11520 }, { "epoch": 1.1187657675140694, "grad_norm": 1.8417193394056512, "learning_rate": 3.145564280615225e-05, "loss": 0.5761, "step": 11530 }, { "epoch": 1.119736076072191, "grad_norm": 1.8784314737872851, "learning_rate": 3.14394185216432e-05, "loss": 0.5179, "step": 11540 }, { "epoch": 1.1207063846303125, "grad_norm": 1.724566892104882, "learning_rate": 3.142319423713415e-05, "loss": 0.5145, "step": 11550 }, { "epoch": 1.1216766931884339, "grad_norm": 1.6271566863595974, "learning_rate": 3.140696995262509e-05, "loss": 0.5557, "step": 11560 }, { "epoch": 1.1226470017465555, "grad_norm": 1.632474409823301, "learning_rate": 3.139074566811604e-05, "loss": 0.5351, "step": 11570 }, { "epoch": 1.123617310304677, "grad_norm": 1.8885164133175332, "learning_rate": 3.1374521383606985e-05, "loss": 0.5066, "step": 11580 }, { "epoch": 1.1245876188627983, "grad_norm": 1.86325603833634, "learning_rate": 3.1358297099097935e-05, "loss": 0.5561, "step": 11590 }, { "epoch": 1.1255579274209198, "grad_norm": 1.8325010805036752, "learning_rate": 3.134207281458888e-05, "loss": 0.5518, "step": 11600 }, { "epoch": 1.1265282359790414, "grad_norm": 1.7732624170282982, "learning_rate": 3.132584853007983e-05, "loss": 0.5264, "step": 11610 }, { "epoch": 1.1274985445371628, "grad_norm": 1.6849511508626207, "learning_rate": 3.130962424557077e-05, "loss": 0.4938, "step": 11620 }, { "epoch": 1.1284688530952842, "grad_norm": 1.7088287322634965, "learning_rate": 3.129339996106172e-05, "loss": 0.5432, "step": 11630 }, { "epoch": 1.1294391616534059, "grad_norm": 2.4521787808904345, "learning_rate": 3.127717567655267e-05, "loss": 0.5222, "step": 11640 }, { "epoch": 1.1304094702115273, "grad_norm": 1.8568933343609308, "learning_rate": 3.126095139204361e-05, "loss": 0.5155, "step": 11650 }, { "epoch": 1.1313797787696487, "grad_norm": 1.6581222418410275, "learning_rate": 3.124472710753456e-05, "loss": 0.5039, "step": 11660 }, { "epoch": 1.1323500873277703, "grad_norm": 1.8783913354540864, "learning_rate": 3.1228502823025506e-05, "loss": 0.5083, "step": 11670 }, { "epoch": 1.1333203958858917, "grad_norm": 1.7156898303987873, "learning_rate": 3.1212278538516455e-05, "loss": 0.5184, "step": 11680 }, { "epoch": 1.1342907044440131, "grad_norm": 2.1185799396739093, "learning_rate": 3.11960542540074e-05, "loss": 0.5252, "step": 11690 }, { "epoch": 1.1352610130021348, "grad_norm": 1.2978327151961555, "learning_rate": 3.117982996949835e-05, "loss": 0.5548, "step": 11700 }, { "epoch": 1.1362313215602562, "grad_norm": 1.6276998380365333, "learning_rate": 3.116360568498929e-05, "loss": 0.5412, "step": 11710 }, { "epoch": 1.1372016301183776, "grad_norm": 1.5330498087853446, "learning_rate": 3.114738140048024e-05, "loss": 0.4811, "step": 11720 }, { "epoch": 1.138171938676499, "grad_norm": 1.545547307525802, "learning_rate": 3.113115711597119e-05, "loss": 0.4841, "step": 11730 }, { "epoch": 1.1391422472346207, "grad_norm": 1.8963148998956563, "learning_rate": 3.1114932831462134e-05, "loss": 0.5048, "step": 11740 }, { "epoch": 1.140112555792742, "grad_norm": 1.927144240645965, "learning_rate": 3.1098708546953084e-05, "loss": 0.5437, "step": 11750 }, { "epoch": 1.1410828643508635, "grad_norm": 1.322758693894623, "learning_rate": 3.1082484262444027e-05, "loss": 0.5824, "step": 11760 }, { "epoch": 1.1420531729089851, "grad_norm": 1.9210557222184033, "learning_rate": 3.1066259977934976e-05, "loss": 0.5208, "step": 11770 }, { "epoch": 1.1430234814671065, "grad_norm": 1.8904898905530507, "learning_rate": 3.105003569342592e-05, "loss": 0.5646, "step": 11780 }, { "epoch": 1.143993790025228, "grad_norm": 2.2135650729717478, "learning_rate": 3.103381140891687e-05, "loss": 0.5823, "step": 11790 }, { "epoch": 1.1449640985833496, "grad_norm": 1.8257267748894057, "learning_rate": 3.101758712440781e-05, "loss": 0.4957, "step": 11800 }, { "epoch": 1.145934407141471, "grad_norm": 1.8399910987928954, "learning_rate": 3.100136283989876e-05, "loss": 0.5589, "step": 11810 }, { "epoch": 1.1469047156995924, "grad_norm": 1.751016659591738, "learning_rate": 3.098513855538971e-05, "loss": 0.4636, "step": 11820 }, { "epoch": 1.147875024257714, "grad_norm": 2.3784962245655743, "learning_rate": 3.0968914270880655e-05, "loss": 0.558, "step": 11830 }, { "epoch": 1.1488453328158355, "grad_norm": 1.7453204083727232, "learning_rate": 3.0952689986371604e-05, "loss": 0.5152, "step": 11840 }, { "epoch": 1.1498156413739569, "grad_norm": 1.6493432346857455, "learning_rate": 3.093646570186255e-05, "loss": 0.5396, "step": 11850 }, { "epoch": 1.1507859499320783, "grad_norm": 1.4016634963337722, "learning_rate": 3.09202414173535e-05, "loss": 0.5392, "step": 11860 }, { "epoch": 1.1517562584902, "grad_norm": 1.5144392427573337, "learning_rate": 3.090401713284444e-05, "loss": 0.5625, "step": 11870 }, { "epoch": 1.1527265670483213, "grad_norm": 1.9911432006869803, "learning_rate": 3.088779284833539e-05, "loss": 0.5187, "step": 11880 }, { "epoch": 1.153696875606443, "grad_norm": 1.989669700307941, "learning_rate": 3.087156856382634e-05, "loss": 0.516, "step": 11890 }, { "epoch": 1.1546671841645644, "grad_norm": 2.0321346469591717, "learning_rate": 3.085534427931728e-05, "loss": 0.5028, "step": 11900 }, { "epoch": 1.1556374927226858, "grad_norm": 1.6419270337849394, "learning_rate": 3.083911999480823e-05, "loss": 0.5114, "step": 11910 }, { "epoch": 1.1566078012808072, "grad_norm": 1.8199861134594042, "learning_rate": 3.0822895710299175e-05, "loss": 0.4862, "step": 11920 }, { "epoch": 1.1575781098389288, "grad_norm": 2.092599945357918, "learning_rate": 3.0806671425790125e-05, "loss": 0.4862, "step": 11930 }, { "epoch": 1.1585484183970503, "grad_norm": 1.6390208725289623, "learning_rate": 3.079044714128107e-05, "loss": 0.5518, "step": 11940 }, { "epoch": 1.1595187269551717, "grad_norm": 1.4035901760825538, "learning_rate": 3.077422285677202e-05, "loss": 0.5195, "step": 11950 }, { "epoch": 1.1604890355132933, "grad_norm": 1.965204556535071, "learning_rate": 3.075799857226296e-05, "loss": 0.5198, "step": 11960 }, { "epoch": 1.1614593440714147, "grad_norm": 1.797317897425162, "learning_rate": 3.074177428775391e-05, "loss": 0.5223, "step": 11970 }, { "epoch": 1.1624296526295361, "grad_norm": 2.0562783101788713, "learning_rate": 3.072555000324486e-05, "loss": 0.5207, "step": 11980 }, { "epoch": 1.1633999611876578, "grad_norm": 1.674272970925229, "learning_rate": 3.0709325718735804e-05, "loss": 0.5034, "step": 11990 }, { "epoch": 1.1643702697457792, "grad_norm": 1.8613409955734859, "learning_rate": 3.069310143422675e-05, "loss": 0.4921, "step": 12000 }, { "epoch": 1.1643702697457792, "eval_loss": 0.6506599187850952, "eval_runtime": 2467.2044, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 12000 }, { "epoch": 1.1653405783039006, "grad_norm": 1.561512098916604, "learning_rate": 3.0676877149717696e-05, "loss": 0.5497, "step": 12010 }, { "epoch": 1.1663108868620222, "grad_norm": 1.865230398957055, "learning_rate": 3.0660652865208646e-05, "loss": 0.4925, "step": 12020 }, { "epoch": 1.1672811954201436, "grad_norm": 2.2512724716981434, "learning_rate": 3.064442858069959e-05, "loss": 0.5295, "step": 12030 }, { "epoch": 1.168251503978265, "grad_norm": 1.8002247664216207, "learning_rate": 3.062820429619054e-05, "loss": 0.4996, "step": 12040 }, { "epoch": 1.1692218125363865, "grad_norm": 2.1116351677440943, "learning_rate": 3.061198001168148e-05, "loss": 0.5191, "step": 12050 }, { "epoch": 1.170192121094508, "grad_norm": 1.6725748372150215, "learning_rate": 3.059575572717243e-05, "loss": 0.5336, "step": 12060 }, { "epoch": 1.1711624296526295, "grad_norm": 1.9025453675497637, "learning_rate": 3.057953144266338e-05, "loss": 0.5188, "step": 12070 }, { "epoch": 1.172132738210751, "grad_norm": 1.8069530215074574, "learning_rate": 3.0563307158154324e-05, "loss": 0.5495, "step": 12080 }, { "epoch": 1.1731030467688726, "grad_norm": 1.4756589453476456, "learning_rate": 3.0547082873645274e-05, "loss": 0.5248, "step": 12090 }, { "epoch": 1.174073355326994, "grad_norm": 1.894556403114157, "learning_rate": 3.053085858913622e-05, "loss": 0.5604, "step": 12100 }, { "epoch": 1.1750436638851154, "grad_norm": 1.7674666934401635, "learning_rate": 3.0514634304627167e-05, "loss": 0.5355, "step": 12110 }, { "epoch": 1.176013972443237, "grad_norm": 1.6165024582050291, "learning_rate": 3.0498410020118113e-05, "loss": 0.4861, "step": 12120 }, { "epoch": 1.1769842810013584, "grad_norm": 1.7586790770235476, "learning_rate": 3.048218573560906e-05, "loss": 0.5009, "step": 12130 }, { "epoch": 1.1779545895594798, "grad_norm": 1.424893034597034, "learning_rate": 3.0465961451100006e-05, "loss": 0.5441, "step": 12140 }, { "epoch": 1.1789248981176015, "grad_norm": 1.79883718769592, "learning_rate": 3.0449737166590952e-05, "loss": 0.5313, "step": 12150 }, { "epoch": 1.179895206675723, "grad_norm": 1.2166009796658896, "learning_rate": 3.04335128820819e-05, "loss": 0.5771, "step": 12160 }, { "epoch": 1.1808655152338443, "grad_norm": 1.8319881864090852, "learning_rate": 3.0417288597572845e-05, "loss": 0.5341, "step": 12170 }, { "epoch": 1.1818358237919657, "grad_norm": 2.000590117325487, "learning_rate": 3.040106431306379e-05, "loss": 0.5434, "step": 12180 }, { "epoch": 1.1828061323500874, "grad_norm": 1.7597266341378504, "learning_rate": 3.038484002855474e-05, "loss": 0.4637, "step": 12190 }, { "epoch": 1.1837764409082088, "grad_norm": 1.7547988130290746, "learning_rate": 3.0368615744045688e-05, "loss": 0.5238, "step": 12200 }, { "epoch": 1.1847467494663304, "grad_norm": 2.072598486347679, "learning_rate": 3.0352391459536634e-05, "loss": 0.5198, "step": 12210 }, { "epoch": 1.1857170580244518, "grad_norm": 1.801682666637873, "learning_rate": 3.033616717502758e-05, "loss": 0.5549, "step": 12220 }, { "epoch": 1.1866873665825732, "grad_norm": 1.7660188156759438, "learning_rate": 3.0319942890518534e-05, "loss": 0.5415, "step": 12230 }, { "epoch": 1.1876576751406946, "grad_norm": 1.531355321026242, "learning_rate": 3.030371860600948e-05, "loss": 0.5125, "step": 12240 }, { "epoch": 1.1886279836988163, "grad_norm": 1.6289177265084471, "learning_rate": 3.0287494321500426e-05, "loss": 0.5131, "step": 12250 }, { "epoch": 1.1895982922569377, "grad_norm": 2.3228651312748423, "learning_rate": 3.0271270036991373e-05, "loss": 0.4859, "step": 12260 }, { "epoch": 1.190568600815059, "grad_norm": 2.076037584027454, "learning_rate": 3.025504575248232e-05, "loss": 0.5234, "step": 12270 }, { "epoch": 1.1915389093731807, "grad_norm": 1.4111596404765325, "learning_rate": 3.0238821467973266e-05, "loss": 0.5226, "step": 12280 }, { "epoch": 1.1925092179313022, "grad_norm": 1.976122066748625, "learning_rate": 3.0222597183464212e-05, "loss": 0.5753, "step": 12290 }, { "epoch": 1.1934795264894236, "grad_norm": 1.6905440972221555, "learning_rate": 3.0206372898955158e-05, "loss": 0.5559, "step": 12300 }, { "epoch": 1.1944498350475452, "grad_norm": 1.549005200001293, "learning_rate": 3.0190148614446108e-05, "loss": 0.5199, "step": 12310 }, { "epoch": 1.1954201436056666, "grad_norm": 1.7886858738633538, "learning_rate": 3.0173924329937054e-05, "loss": 0.5524, "step": 12320 }, { "epoch": 1.196390452163788, "grad_norm": 2.226393124620313, "learning_rate": 3.0157700045428e-05, "loss": 0.511, "step": 12330 }, { "epoch": 1.1973607607219097, "grad_norm": 1.84134054205334, "learning_rate": 3.0141475760918947e-05, "loss": 0.5537, "step": 12340 }, { "epoch": 1.198331069280031, "grad_norm": 1.938676760265039, "learning_rate": 3.0125251476409894e-05, "loss": 0.5782, "step": 12350 }, { "epoch": 1.1993013778381525, "grad_norm": 1.6671922582631973, "learning_rate": 3.010902719190084e-05, "loss": 0.5146, "step": 12360 }, { "epoch": 1.200271686396274, "grad_norm": 2.09828561481085, "learning_rate": 3.0092802907391786e-05, "loss": 0.5292, "step": 12370 }, { "epoch": 1.2012419949543955, "grad_norm": 1.782738185716262, "learning_rate": 3.0076578622882733e-05, "loss": 0.5797, "step": 12380 }, { "epoch": 1.202212303512517, "grad_norm": 1.7386393975624728, "learning_rate": 3.006035433837368e-05, "loss": 0.5459, "step": 12390 }, { "epoch": 1.2031826120706384, "grad_norm": 1.834619533516341, "learning_rate": 3.004413005386463e-05, "loss": 0.5711, "step": 12400 }, { "epoch": 1.20415292062876, "grad_norm": 1.6555949649402653, "learning_rate": 3.0027905769355575e-05, "loss": 0.5447, "step": 12410 }, { "epoch": 1.2051232291868814, "grad_norm": 1.997714330931175, "learning_rate": 3.001168148484652e-05, "loss": 0.5001, "step": 12420 }, { "epoch": 1.2060935377450028, "grad_norm": 1.943566532939107, "learning_rate": 2.9995457200337468e-05, "loss": 0.5612, "step": 12430 }, { "epoch": 1.2070638463031245, "grad_norm": 2.295630424579193, "learning_rate": 2.9979232915828414e-05, "loss": 0.5443, "step": 12440 }, { "epoch": 1.2080341548612459, "grad_norm": 1.6087261613477206, "learning_rate": 2.996300863131936e-05, "loss": 0.4929, "step": 12450 }, { "epoch": 1.2090044634193673, "grad_norm": 1.7507917723489945, "learning_rate": 2.9946784346810307e-05, "loss": 0.4915, "step": 12460 }, { "epoch": 1.209974771977489, "grad_norm": 1.712263283374138, "learning_rate": 2.9930560062301254e-05, "loss": 0.5184, "step": 12470 }, { "epoch": 1.2109450805356103, "grad_norm": 2.1091363117437427, "learning_rate": 2.9914335777792203e-05, "loss": 0.5123, "step": 12480 }, { "epoch": 1.2119153890937318, "grad_norm": 2.404178772933857, "learning_rate": 2.989811149328315e-05, "loss": 0.4568, "step": 12490 }, { "epoch": 1.2128856976518532, "grad_norm": 1.6898907246159178, "learning_rate": 2.9881887208774096e-05, "loss": 0.5114, "step": 12500 }, { "epoch": 1.2128856976518532, "eval_loss": 0.6506454348564148, "eval_runtime": 2464.983, "eval_samples_per_second": 0.727, "eval_steps_per_second": 0.363, "step": 12500 }, { "epoch": 1.2138560062099748, "grad_norm": 1.5632796322726878, "learning_rate": 2.9865662924265042e-05, "loss": 0.5381, "step": 12510 }, { "epoch": 1.2148263147680962, "grad_norm": 2.2060623757238482, "learning_rate": 2.984943863975599e-05, "loss": 0.6286, "step": 12520 }, { "epoch": 1.2157966233262179, "grad_norm": 1.6896138867780373, "learning_rate": 2.9833214355246935e-05, "loss": 0.4824, "step": 12530 }, { "epoch": 1.2167669318843393, "grad_norm": 1.6264014630619223, "learning_rate": 2.981699007073788e-05, "loss": 0.5231, "step": 12540 }, { "epoch": 1.2177372404424607, "grad_norm": 2.288555955501704, "learning_rate": 2.9800765786228828e-05, "loss": 0.5415, "step": 12550 }, { "epoch": 1.218707549000582, "grad_norm": 1.6328806432164462, "learning_rate": 2.9784541501719774e-05, "loss": 0.5282, "step": 12560 }, { "epoch": 1.2196778575587037, "grad_norm": 1.9940506922760688, "learning_rate": 2.9768317217210724e-05, "loss": 0.5283, "step": 12570 }, { "epoch": 1.2206481661168251, "grad_norm": 1.9369189438911159, "learning_rate": 2.975209293270167e-05, "loss": 0.4722, "step": 12580 }, { "epoch": 1.2216184746749466, "grad_norm": 1.150006209597975, "learning_rate": 2.9735868648192617e-05, "loss": 0.5184, "step": 12590 }, { "epoch": 1.2225887832330682, "grad_norm": 1.7210167191672803, "learning_rate": 2.9719644363683563e-05, "loss": 0.5031, "step": 12600 }, { "epoch": 1.2235590917911896, "grad_norm": 1.9388550988757736, "learning_rate": 2.970342007917451e-05, "loss": 0.5099, "step": 12610 }, { "epoch": 1.224529400349311, "grad_norm": 1.6524197083323393, "learning_rate": 2.9687195794665456e-05, "loss": 0.4447, "step": 12620 }, { "epoch": 1.2254997089074326, "grad_norm": 1.89736479001966, "learning_rate": 2.9670971510156402e-05, "loss": 0.4983, "step": 12630 }, { "epoch": 1.226470017465554, "grad_norm": 1.895097527141105, "learning_rate": 2.965474722564735e-05, "loss": 0.5039, "step": 12640 }, { "epoch": 1.2274403260236755, "grad_norm": 1.976305435076919, "learning_rate": 2.96385229411383e-05, "loss": 0.5337, "step": 12650 }, { "epoch": 1.2284106345817971, "grad_norm": 1.6796293491451193, "learning_rate": 2.9622298656629245e-05, "loss": 0.506, "step": 12660 }, { "epoch": 1.2293809431399185, "grad_norm": 1.6431597372306554, "learning_rate": 2.960607437212019e-05, "loss": 0.5797, "step": 12670 }, { "epoch": 1.23035125169804, "grad_norm": 1.8020203988119472, "learning_rate": 2.9589850087611138e-05, "loss": 0.4957, "step": 12680 }, { "epoch": 1.2313215602561614, "grad_norm": 2.0575623836232935, "learning_rate": 2.9573625803102084e-05, "loss": 0.5125, "step": 12690 }, { "epoch": 1.232291868814283, "grad_norm": 2.0440836316269544, "learning_rate": 2.955740151859303e-05, "loss": 0.5186, "step": 12700 }, { "epoch": 1.2332621773724044, "grad_norm": 1.6684531467277435, "learning_rate": 2.9541177234083977e-05, "loss": 0.5348, "step": 12710 }, { "epoch": 1.2342324859305258, "grad_norm": 1.6949248831996988, "learning_rate": 2.9524952949574923e-05, "loss": 0.5484, "step": 12720 }, { "epoch": 1.2352027944886474, "grad_norm": 1.773704436016878, "learning_rate": 2.950872866506587e-05, "loss": 0.5033, "step": 12730 }, { "epoch": 1.2361731030467689, "grad_norm": 1.9246733654165475, "learning_rate": 2.949250438055682e-05, "loss": 0.5164, "step": 12740 }, { "epoch": 1.2371434116048903, "grad_norm": 1.7869787657786207, "learning_rate": 2.9476280096047766e-05, "loss": 0.549, "step": 12750 }, { "epoch": 1.238113720163012, "grad_norm": 1.5427226807712424, "learning_rate": 2.9460055811538712e-05, "loss": 0.5453, "step": 12760 }, { "epoch": 1.2390840287211333, "grad_norm": 2.1243893484204706, "learning_rate": 2.944383152702966e-05, "loss": 0.5232, "step": 12770 }, { "epoch": 1.2400543372792547, "grad_norm": 1.7624693076719502, "learning_rate": 2.9427607242520605e-05, "loss": 0.4767, "step": 12780 }, { "epoch": 1.2410246458373764, "grad_norm": 1.8048923369800416, "learning_rate": 2.941138295801155e-05, "loss": 0.5786, "step": 12790 }, { "epoch": 1.2419949543954978, "grad_norm": 1.557577350338282, "learning_rate": 2.9395158673502498e-05, "loss": 0.5358, "step": 12800 }, { "epoch": 1.2429652629536192, "grad_norm": 1.7796545697030264, "learning_rate": 2.9378934388993444e-05, "loss": 0.5447, "step": 12810 }, { "epoch": 1.2439355715117406, "grad_norm": 1.5233346835390529, "learning_rate": 2.936271010448439e-05, "loss": 0.5434, "step": 12820 }, { "epoch": 1.2449058800698622, "grad_norm": 1.8281296123454516, "learning_rate": 2.934648581997534e-05, "loss": 0.5311, "step": 12830 }, { "epoch": 1.2458761886279837, "grad_norm": 1.8761299295652716, "learning_rate": 2.9330261535466287e-05, "loss": 0.561, "step": 12840 }, { "epoch": 1.2468464971861053, "grad_norm": 1.571060091128229, "learning_rate": 2.9314037250957233e-05, "loss": 0.5293, "step": 12850 }, { "epoch": 1.2478168057442267, "grad_norm": 1.3554175730214915, "learning_rate": 2.929781296644818e-05, "loss": 0.5259, "step": 12860 }, { "epoch": 1.2487871143023481, "grad_norm": 1.5048611859450334, "learning_rate": 2.9281588681939126e-05, "loss": 0.5526, "step": 12870 }, { "epoch": 1.2497574228604695, "grad_norm": 1.944003477785508, "learning_rate": 2.9265364397430072e-05, "loss": 0.5319, "step": 12880 }, { "epoch": 1.2507277314185912, "grad_norm": 1.6137183858737645, "learning_rate": 2.924914011292102e-05, "loss": 0.5219, "step": 12890 }, { "epoch": 1.2516980399767126, "grad_norm": 1.8792239391042322, "learning_rate": 2.9232915828411965e-05, "loss": 0.4996, "step": 12900 }, { "epoch": 1.252668348534834, "grad_norm": 1.6048948758745463, "learning_rate": 2.9216691543902915e-05, "loss": 0.5094, "step": 12910 }, { "epoch": 1.2536386570929556, "grad_norm": 2.06874670039208, "learning_rate": 2.920046725939386e-05, "loss": 0.5412, "step": 12920 }, { "epoch": 1.254608965651077, "grad_norm": 1.8299215479076065, "learning_rate": 2.9184242974884808e-05, "loss": 0.5215, "step": 12930 }, { "epoch": 1.2555792742091985, "grad_norm": 1.768175531798484, "learning_rate": 2.9168018690375754e-05, "loss": 0.5466, "step": 12940 }, { "epoch": 1.2565495827673199, "grad_norm": 2.19878862821484, "learning_rate": 2.91517944058667e-05, "loss": 0.5728, "step": 12950 }, { "epoch": 1.2575198913254415, "grad_norm": 1.5410059750444967, "learning_rate": 2.9135570121357647e-05, "loss": 0.5248, "step": 12960 }, { "epoch": 1.258490199883563, "grad_norm": 1.9319057652262193, "learning_rate": 2.9119345836848593e-05, "loss": 0.4906, "step": 12970 }, { "epoch": 1.2594605084416846, "grad_norm": 1.4285564772766584, "learning_rate": 2.910312155233954e-05, "loss": 0.4976, "step": 12980 }, { "epoch": 1.260430816999806, "grad_norm": 1.9098028563216323, "learning_rate": 2.9086897267830486e-05, "loss": 0.4841, "step": 12990 }, { "epoch": 1.2614011255579274, "grad_norm": 2.155481185125366, "learning_rate": 2.9070672983321436e-05, "loss": 0.5486, "step": 13000 }, { "epoch": 1.2614011255579274, "eval_loss": 0.6476565003395081, "eval_runtime": 2469.4767, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 13000 }, { "epoch": 1.2623714341160488, "grad_norm": 1.7583302051951548, "learning_rate": 2.9054448698812385e-05, "loss": 0.5287, "step": 13010 }, { "epoch": 1.2633417426741704, "grad_norm": 2.1364231232324777, "learning_rate": 2.9038224414303332e-05, "loss": 0.5338, "step": 13020 }, { "epoch": 1.2643120512322918, "grad_norm": 1.858948662905276, "learning_rate": 2.902200012979428e-05, "loss": 0.5371, "step": 13030 }, { "epoch": 1.2652823597904135, "grad_norm": 1.434526008901497, "learning_rate": 2.9005775845285228e-05, "loss": 0.5199, "step": 13040 }, { "epoch": 1.266252668348535, "grad_norm": 1.8326954509581932, "learning_rate": 2.8989551560776174e-05, "loss": 0.4626, "step": 13050 }, { "epoch": 1.2672229769066563, "grad_norm": 1.929734623290484, "learning_rate": 2.897332727626712e-05, "loss": 0.5095, "step": 13060 }, { "epoch": 1.2681932854647777, "grad_norm": 1.7342388298669522, "learning_rate": 2.8957102991758067e-05, "loss": 0.5334, "step": 13070 }, { "epoch": 1.2691635940228994, "grad_norm": 1.8419675684147427, "learning_rate": 2.8940878707249013e-05, "loss": 0.5215, "step": 13080 }, { "epoch": 1.2701339025810208, "grad_norm": 2.0003719776284434, "learning_rate": 2.892465442273996e-05, "loss": 0.5291, "step": 13090 }, { "epoch": 1.2711042111391422, "grad_norm": 1.5672070400575155, "learning_rate": 2.8908430138230906e-05, "loss": 0.5302, "step": 13100 }, { "epoch": 1.2720745196972638, "grad_norm": 1.6329476161915306, "learning_rate": 2.8892205853721856e-05, "loss": 0.5004, "step": 13110 }, { "epoch": 1.2730448282553852, "grad_norm": 1.3430643020518607, "learning_rate": 2.8875981569212802e-05, "loss": 0.5586, "step": 13120 }, { "epoch": 1.2740151368135066, "grad_norm": 1.865116300464639, "learning_rate": 2.885975728470375e-05, "loss": 0.5176, "step": 13130 }, { "epoch": 1.274985445371628, "grad_norm": 1.9537429368278099, "learning_rate": 2.8843533000194695e-05, "loss": 0.5167, "step": 13140 }, { "epoch": 1.2759557539297497, "grad_norm": 2.059620522389882, "learning_rate": 2.882730871568564e-05, "loss": 0.4654, "step": 13150 }, { "epoch": 1.276926062487871, "grad_norm": 2.1139159674960317, "learning_rate": 2.8811084431176588e-05, "loss": 0.5365, "step": 13160 }, { "epoch": 1.2778963710459927, "grad_norm": 1.9675564766085183, "learning_rate": 2.8794860146667534e-05, "loss": 0.5567, "step": 13170 }, { "epoch": 1.2788666796041142, "grad_norm": 1.4714857260607748, "learning_rate": 2.877863586215848e-05, "loss": 0.5543, "step": 13180 }, { "epoch": 1.2798369881622356, "grad_norm": 1.997029571976202, "learning_rate": 2.8762411577649427e-05, "loss": 0.5349, "step": 13190 }, { "epoch": 1.280807296720357, "grad_norm": 1.7739211918572755, "learning_rate": 2.8746187293140377e-05, "loss": 0.4893, "step": 13200 }, { "epoch": 1.2817776052784786, "grad_norm": 1.7069871948143287, "learning_rate": 2.8729963008631323e-05, "loss": 0.4907, "step": 13210 }, { "epoch": 1.2827479138366, "grad_norm": 1.9981954512910625, "learning_rate": 2.871373872412227e-05, "loss": 0.5672, "step": 13220 }, { "epoch": 1.2837182223947214, "grad_norm": 1.6613620495892372, "learning_rate": 2.8697514439613216e-05, "loss": 0.541, "step": 13230 }, { "epoch": 1.284688530952843, "grad_norm": 2.3272590952107675, "learning_rate": 2.8681290155104162e-05, "loss": 0.5411, "step": 13240 }, { "epoch": 1.2856588395109645, "grad_norm": 2.12223916886103, "learning_rate": 2.866506587059511e-05, "loss": 0.4957, "step": 13250 }, { "epoch": 1.286629148069086, "grad_norm": 1.4853001948712596, "learning_rate": 2.8648841586086055e-05, "loss": 0.5432, "step": 13260 }, { "epoch": 1.2875994566272073, "grad_norm": 2.095035318310476, "learning_rate": 2.8632617301577e-05, "loss": 0.5466, "step": 13270 }, { "epoch": 1.288569765185329, "grad_norm": 1.8562546228217478, "learning_rate": 2.8616393017067948e-05, "loss": 0.5393, "step": 13280 }, { "epoch": 1.2895400737434504, "grad_norm": 1.960721539695441, "learning_rate": 2.8600168732558898e-05, "loss": 0.5737, "step": 13290 }, { "epoch": 1.290510382301572, "grad_norm": 1.8998278241439668, "learning_rate": 2.8583944448049844e-05, "loss": 0.4965, "step": 13300 }, { "epoch": 1.2914806908596934, "grad_norm": 1.648594607244266, "learning_rate": 2.856772016354079e-05, "loss": 0.5691, "step": 13310 }, { "epoch": 1.2924509994178148, "grad_norm": 1.65007773003003, "learning_rate": 2.8551495879031737e-05, "loss": 0.5, "step": 13320 }, { "epoch": 1.2934213079759362, "grad_norm": 2.0573651261935293, "learning_rate": 2.8535271594522683e-05, "loss": 0.4926, "step": 13330 }, { "epoch": 1.2943916165340579, "grad_norm": 2.328992496419527, "learning_rate": 2.851904731001363e-05, "loss": 0.5221, "step": 13340 }, { "epoch": 1.2953619250921793, "grad_norm": 1.8509833991470588, "learning_rate": 2.8502823025504576e-05, "loss": 0.5018, "step": 13350 }, { "epoch": 1.296332233650301, "grad_norm": 1.895399492110669, "learning_rate": 2.8486598740995522e-05, "loss": 0.4724, "step": 13360 }, { "epoch": 1.2973025422084223, "grad_norm": 1.667924410333677, "learning_rate": 2.8470374456486472e-05, "loss": 0.5165, "step": 13370 }, { "epoch": 1.2982728507665438, "grad_norm": 2.1618438681975722, "learning_rate": 2.845415017197742e-05, "loss": 0.5208, "step": 13380 }, { "epoch": 1.2992431593246652, "grad_norm": 2.0324515358778434, "learning_rate": 2.8437925887468365e-05, "loss": 0.4944, "step": 13390 }, { "epoch": 1.3002134678827868, "grad_norm": 1.4374011148146266, "learning_rate": 2.842170160295931e-05, "loss": 0.556, "step": 13400 }, { "epoch": 1.3011837764409082, "grad_norm": 2.0625833924885075, "learning_rate": 2.8405477318450258e-05, "loss": 0.533, "step": 13410 }, { "epoch": 1.3021540849990296, "grad_norm": 1.7327381778160986, "learning_rate": 2.8389253033941204e-05, "loss": 0.5428, "step": 13420 }, { "epoch": 1.3031243935571513, "grad_norm": 1.5980081031800506, "learning_rate": 2.837302874943215e-05, "loss": 0.5169, "step": 13430 }, { "epoch": 1.3040947021152727, "grad_norm": 1.4133329538382045, "learning_rate": 2.8356804464923097e-05, "loss": 0.5365, "step": 13440 }, { "epoch": 1.305065010673394, "grad_norm": 1.6052634633267397, "learning_rate": 2.8340580180414043e-05, "loss": 0.5062, "step": 13450 }, { "epoch": 1.3060353192315155, "grad_norm": 1.8789186495289818, "learning_rate": 2.8324355895904993e-05, "loss": 0.5301, "step": 13460 }, { "epoch": 1.3070056277896371, "grad_norm": 1.7539318856514114, "learning_rate": 2.830813161139594e-05, "loss": 0.5191, "step": 13470 }, { "epoch": 1.3079759363477585, "grad_norm": 1.652081891063903, "learning_rate": 2.8291907326886886e-05, "loss": 0.5162, "step": 13480 }, { "epoch": 1.3089462449058802, "grad_norm": 1.9042372320107943, "learning_rate": 2.8275683042377832e-05, "loss": 0.5202, "step": 13490 }, { "epoch": 1.3099165534640016, "grad_norm": 1.9149924815733665, "learning_rate": 2.825945875786878e-05, "loss": 0.5785, "step": 13500 }, { "epoch": 1.3099165534640016, "eval_loss": 0.6458503007888794, "eval_runtime": 2472.3879, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 13500 }, { "epoch": 1.310886862022123, "grad_norm": 1.6405768771538807, "learning_rate": 2.8243234473359725e-05, "loss": 0.5037, "step": 13510 }, { "epoch": 1.3118571705802444, "grad_norm": 1.8018934132744455, "learning_rate": 2.822701018885067e-05, "loss": 0.5465, "step": 13520 }, { "epoch": 1.312827479138366, "grad_norm": 2.013531086384291, "learning_rate": 2.8210785904341618e-05, "loss": 0.5011, "step": 13530 }, { "epoch": 1.3137977876964875, "grad_norm": 2.022552703035907, "learning_rate": 2.8194561619832567e-05, "loss": 0.5657, "step": 13540 }, { "epoch": 1.3147680962546089, "grad_norm": 1.734149403661694, "learning_rate": 2.8178337335323514e-05, "loss": 0.5074, "step": 13550 }, { "epoch": 1.3157384048127305, "grad_norm": 2.1508604289357476, "learning_rate": 2.816211305081446e-05, "loss": 0.4873, "step": 13560 }, { "epoch": 1.316708713370852, "grad_norm": 1.9266405893568426, "learning_rate": 2.8145888766305406e-05, "loss": 0.5231, "step": 13570 }, { "epoch": 1.3176790219289733, "grad_norm": 1.8623995402610134, "learning_rate": 2.8129664481796353e-05, "loss": 0.5748, "step": 13580 }, { "epoch": 1.3186493304870948, "grad_norm": 1.5283427217297032, "learning_rate": 2.81134401972873e-05, "loss": 0.5322, "step": 13590 }, { "epoch": 1.3196196390452164, "grad_norm": 1.7148189442620572, "learning_rate": 2.8097215912778246e-05, "loss": 0.4896, "step": 13600 }, { "epoch": 1.3205899476033378, "grad_norm": 1.4473714452022712, "learning_rate": 2.8080991628269192e-05, "loss": 0.4974, "step": 13610 }, { "epoch": 1.3215602561614594, "grad_norm": 1.8643931328762333, "learning_rate": 2.806476734376014e-05, "loss": 0.5707, "step": 13620 }, { "epoch": 1.3225305647195809, "grad_norm": 2.1475243574365264, "learning_rate": 2.8048543059251088e-05, "loss": 0.5167, "step": 13630 }, { "epoch": 1.3235008732777023, "grad_norm": 1.6293038645225775, "learning_rate": 2.8032318774742035e-05, "loss": 0.5268, "step": 13640 }, { "epoch": 1.3244711818358237, "grad_norm": 1.8987021880259773, "learning_rate": 2.801609449023298e-05, "loss": 0.5432, "step": 13650 }, { "epoch": 1.3254414903939453, "grad_norm": 1.5753855821090716, "learning_rate": 2.7999870205723927e-05, "loss": 0.4827, "step": 13660 }, { "epoch": 1.3264117989520667, "grad_norm": 1.350257936732941, "learning_rate": 2.7983645921214874e-05, "loss": 0.5544, "step": 13670 }, { "epoch": 1.3273821075101884, "grad_norm": 1.9293055864878257, "learning_rate": 2.796742163670582e-05, "loss": 0.5441, "step": 13680 }, { "epoch": 1.3283524160683098, "grad_norm": 2.2701614440224813, "learning_rate": 2.7951197352196766e-05, "loss": 0.5237, "step": 13690 }, { "epoch": 1.3293227246264312, "grad_norm": 1.2019327117406085, "learning_rate": 2.7934973067687713e-05, "loss": 0.5434, "step": 13700 }, { "epoch": 1.3302930331845526, "grad_norm": 1.406257025731888, "learning_rate": 2.7918748783178663e-05, "loss": 0.5063, "step": 13710 }, { "epoch": 1.3312633417426742, "grad_norm": 1.666513597838276, "learning_rate": 2.790252449866961e-05, "loss": 0.5068, "step": 13720 }, { "epoch": 1.3322336503007957, "grad_norm": 1.7668183417692156, "learning_rate": 2.7886300214160555e-05, "loss": 0.5141, "step": 13730 }, { "epoch": 1.333203958858917, "grad_norm": 1.433593593379768, "learning_rate": 2.7870075929651502e-05, "loss": 0.4626, "step": 13740 }, { "epoch": 1.3341742674170387, "grad_norm": 2.009965700363568, "learning_rate": 2.7853851645142448e-05, "loss": 0.5618, "step": 13750 }, { "epoch": 1.3351445759751601, "grad_norm": 1.763668408586817, "learning_rate": 2.7837627360633395e-05, "loss": 0.5369, "step": 13760 }, { "epoch": 1.3361148845332815, "grad_norm": 2.1411324301354053, "learning_rate": 2.782140307612434e-05, "loss": 0.5084, "step": 13770 }, { "epoch": 1.337085193091403, "grad_norm": 1.436827138346562, "learning_rate": 2.7805178791615287e-05, "loss": 0.5032, "step": 13780 }, { "epoch": 1.3380555016495246, "grad_norm": 1.9606877958295938, "learning_rate": 2.778895450710624e-05, "loss": 0.5327, "step": 13790 }, { "epoch": 1.339025810207646, "grad_norm": 2.0522393469060396, "learning_rate": 2.7772730222597187e-05, "loss": 0.5398, "step": 13800 }, { "epoch": 1.3399961187657676, "grad_norm": 1.9786107551791654, "learning_rate": 2.7756505938088133e-05, "loss": 0.5009, "step": 13810 }, { "epoch": 1.340966427323889, "grad_norm": 2.090584551627846, "learning_rate": 2.774028165357908e-05, "loss": 0.5296, "step": 13820 }, { "epoch": 1.3419367358820105, "grad_norm": 2.1298950160365613, "learning_rate": 2.772405736907003e-05, "loss": 0.4865, "step": 13830 }, { "epoch": 1.3429070444401319, "grad_norm": 1.7629297128429193, "learning_rate": 2.7707833084560976e-05, "loss": 0.5324, "step": 13840 }, { "epoch": 1.3438773529982535, "grad_norm": 1.8397197756904764, "learning_rate": 2.7691608800051922e-05, "loss": 0.5732, "step": 13850 }, { "epoch": 1.344847661556375, "grad_norm": 1.8108747182833855, "learning_rate": 2.767538451554287e-05, "loss": 0.4947, "step": 13860 }, { "epoch": 1.3458179701144963, "grad_norm": 1.8199614245355178, "learning_rate": 2.7659160231033815e-05, "loss": 0.5557, "step": 13870 }, { "epoch": 1.346788278672618, "grad_norm": 1.3431248134057203, "learning_rate": 2.764293594652476e-05, "loss": 0.5404, "step": 13880 }, { "epoch": 1.3477585872307394, "grad_norm": 1.5574090523973907, "learning_rate": 2.7626711662015708e-05, "loss": 0.568, "step": 13890 }, { "epoch": 1.3487288957888608, "grad_norm": 1.4115013178507176, "learning_rate": 2.7610487377506654e-05, "loss": 0.4716, "step": 13900 }, { "epoch": 1.3496992043469822, "grad_norm": 2.1040010286866444, "learning_rate": 2.75942630929976e-05, "loss": 0.4934, "step": 13910 }, { "epoch": 1.3506695129051038, "grad_norm": 1.7498060957230301, "learning_rate": 2.757803880848855e-05, "loss": 0.5454, "step": 13920 }, { "epoch": 1.3516398214632253, "grad_norm": 1.8910220759501428, "learning_rate": 2.7561814523979497e-05, "loss": 0.5641, "step": 13930 }, { "epoch": 1.352610130021347, "grad_norm": 1.6170015266249442, "learning_rate": 2.7545590239470443e-05, "loss": 0.5378, "step": 13940 }, { "epoch": 1.3535804385794683, "grad_norm": 1.7666178388963019, "learning_rate": 2.752936595496139e-05, "loss": 0.5448, "step": 13950 }, { "epoch": 1.3545507471375897, "grad_norm": 2.052806059730038, "learning_rate": 2.7513141670452336e-05, "loss": 0.5158, "step": 13960 }, { "epoch": 1.3555210556957111, "grad_norm": 1.9222501285118103, "learning_rate": 2.7496917385943282e-05, "loss": 0.4982, "step": 13970 }, { "epoch": 1.3564913642538328, "grad_norm": 2.1370232742150903, "learning_rate": 2.748069310143423e-05, "loss": 0.5409, "step": 13980 }, { "epoch": 1.3574616728119542, "grad_norm": 1.7396628585118512, "learning_rate": 2.7464468816925175e-05, "loss": 0.5327, "step": 13990 }, { "epoch": 1.3584319813700758, "grad_norm": 1.990716927328341, "learning_rate": 2.7448244532416125e-05, "loss": 0.5291, "step": 14000 }, { "epoch": 1.3584319813700758, "eval_loss": 0.6447646021842957, "eval_runtime": 2468.8402, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 14000 }, { "epoch": 1.3594022899281972, "grad_norm": 1.7121198472067753, "learning_rate": 2.743202024790707e-05, "loss": 0.4758, "step": 14010 }, { "epoch": 1.3603725984863186, "grad_norm": 1.6674489776942332, "learning_rate": 2.7415795963398017e-05, "loss": 0.5484, "step": 14020 }, { "epoch": 1.36134290704444, "grad_norm": 1.458164078535125, "learning_rate": 2.7399571678888964e-05, "loss": 0.5523, "step": 14030 }, { "epoch": 1.3623132156025617, "grad_norm": 1.8775916401381276, "learning_rate": 2.738334739437991e-05, "loss": 0.5254, "step": 14040 }, { "epoch": 1.363283524160683, "grad_norm": 1.671470903392816, "learning_rate": 2.7367123109870857e-05, "loss": 0.5005, "step": 14050 }, { "epoch": 1.3642538327188045, "grad_norm": 1.9806763146120538, "learning_rate": 2.7350898825361803e-05, "loss": 0.5134, "step": 14060 }, { "epoch": 1.3652241412769262, "grad_norm": 1.514946401551448, "learning_rate": 2.733467454085275e-05, "loss": 0.5597, "step": 14070 }, { "epoch": 1.3661944498350476, "grad_norm": 1.6635049106626887, "learning_rate": 2.7318450256343696e-05, "loss": 0.5377, "step": 14080 }, { "epoch": 1.367164758393169, "grad_norm": 1.7564332843089718, "learning_rate": 2.7302225971834645e-05, "loss": 0.5027, "step": 14090 }, { "epoch": 1.3681350669512904, "grad_norm": 2.0454872453162682, "learning_rate": 2.7286001687325592e-05, "loss": 0.5056, "step": 14100 }, { "epoch": 1.369105375509412, "grad_norm": 2.0786210322749032, "learning_rate": 2.7269777402816538e-05, "loss": 0.5835, "step": 14110 }, { "epoch": 1.3700756840675334, "grad_norm": 2.1275215610783555, "learning_rate": 2.7253553118307485e-05, "loss": 0.5235, "step": 14120 }, { "epoch": 1.371045992625655, "grad_norm": 1.7013208723478948, "learning_rate": 2.723732883379843e-05, "loss": 0.5105, "step": 14130 }, { "epoch": 1.3720163011837765, "grad_norm": 1.586441401545233, "learning_rate": 2.7221104549289377e-05, "loss": 0.5221, "step": 14140 }, { "epoch": 1.372986609741898, "grad_norm": 1.533260185383275, "learning_rate": 2.7204880264780324e-05, "loss": 0.4714, "step": 14150 }, { "epoch": 1.3739569183000193, "grad_norm": 1.99495942930363, "learning_rate": 2.718865598027127e-05, "loss": 0.5127, "step": 14160 }, { "epoch": 1.374927226858141, "grad_norm": 1.5455163764317608, "learning_rate": 2.717243169576222e-05, "loss": 0.5331, "step": 14170 }, { "epoch": 1.3758975354162624, "grad_norm": 2.542301246782172, "learning_rate": 2.7156207411253166e-05, "loss": 0.5196, "step": 14180 }, { "epoch": 1.3768678439743838, "grad_norm": 2.367306381136639, "learning_rate": 2.7139983126744113e-05, "loss": 0.5173, "step": 14190 }, { "epoch": 1.3778381525325054, "grad_norm": 1.748922567904988, "learning_rate": 2.712375884223506e-05, "loss": 0.5301, "step": 14200 }, { "epoch": 1.3788084610906268, "grad_norm": 1.9185950864572987, "learning_rate": 2.7107534557726005e-05, "loss": 0.4242, "step": 14210 }, { "epoch": 1.3797787696487482, "grad_norm": 1.7876319295098282, "learning_rate": 2.7091310273216952e-05, "loss": 0.5251, "step": 14220 }, { "epoch": 1.3807490782068697, "grad_norm": 1.7638194028118903, "learning_rate": 2.7075085988707898e-05, "loss": 0.5087, "step": 14230 }, { "epoch": 1.3817193867649913, "grad_norm": 1.781232728742025, "learning_rate": 2.7058861704198845e-05, "loss": 0.46, "step": 14240 }, { "epoch": 1.3826896953231127, "grad_norm": 2.0454750065003564, "learning_rate": 2.704263741968979e-05, "loss": 0.542, "step": 14250 }, { "epoch": 1.3836600038812343, "grad_norm": 2.2368401504756, "learning_rate": 2.702641313518074e-05, "loss": 0.501, "step": 14260 }, { "epoch": 1.3846303124393557, "grad_norm": 1.4717190899560046, "learning_rate": 2.7010188850671687e-05, "loss": 0.5146, "step": 14270 }, { "epoch": 1.3856006209974772, "grad_norm": 1.8752670210283335, "learning_rate": 2.6993964566162634e-05, "loss": 0.4999, "step": 14280 }, { "epoch": 1.3865709295555986, "grad_norm": 2.065234795556922, "learning_rate": 2.697774028165358e-05, "loss": 0.4447, "step": 14290 }, { "epoch": 1.3875412381137202, "grad_norm": 2.125070528749461, "learning_rate": 2.6961515997144526e-05, "loss": 0.4797, "step": 14300 }, { "epoch": 1.3885115466718416, "grad_norm": 1.5859308448291956, "learning_rate": 2.6945291712635473e-05, "loss": 0.5336, "step": 14310 }, { "epoch": 1.3894818552299633, "grad_norm": 2.861319646114515, "learning_rate": 2.692906742812642e-05, "loss": 0.4812, "step": 14320 }, { "epoch": 1.3904521637880847, "grad_norm": 1.7597224976071428, "learning_rate": 2.6912843143617365e-05, "loss": 0.55, "step": 14330 }, { "epoch": 1.391422472346206, "grad_norm": 1.681866583110354, "learning_rate": 2.6896618859108312e-05, "loss": 0.5046, "step": 14340 }, { "epoch": 1.3923927809043275, "grad_norm": 1.5215374726522628, "learning_rate": 2.688039457459926e-05, "loss": 0.5579, "step": 14350 }, { "epoch": 1.3933630894624491, "grad_norm": 2.079812021813028, "learning_rate": 2.6864170290090208e-05, "loss": 0.5495, "step": 14360 }, { "epoch": 1.3943333980205705, "grad_norm": 1.617321032110613, "learning_rate": 2.6847946005581154e-05, "loss": 0.529, "step": 14370 }, { "epoch": 1.395303706578692, "grad_norm": 1.9418293727425169, "learning_rate": 2.68317217210721e-05, "loss": 0.5503, "step": 14380 }, { "epoch": 1.3962740151368136, "grad_norm": 1.6562011692834846, "learning_rate": 2.6815497436563047e-05, "loss": 0.5245, "step": 14390 }, { "epoch": 1.397244323694935, "grad_norm": 1.8991877294518344, "learning_rate": 2.6799273152053994e-05, "loss": 0.4976, "step": 14400 }, { "epoch": 1.3982146322530564, "grad_norm": 1.9301826202288486, "learning_rate": 2.678304886754494e-05, "loss": 0.4878, "step": 14410 }, { "epoch": 1.3991849408111778, "grad_norm": 2.019905342024004, "learning_rate": 2.6766824583035886e-05, "loss": 0.5093, "step": 14420 }, { "epoch": 1.4001552493692995, "grad_norm": 2.364843712084718, "learning_rate": 2.6750600298526836e-05, "loss": 0.5005, "step": 14430 }, { "epoch": 1.4011255579274209, "grad_norm": 1.9751607681902115, "learning_rate": 2.6734376014017782e-05, "loss": 0.4958, "step": 14440 }, { "epoch": 1.4020958664855425, "grad_norm": 1.9625121046714047, "learning_rate": 2.671815172950873e-05, "loss": 0.5251, "step": 14450 }, { "epoch": 1.403066175043664, "grad_norm": 1.4828612193168078, "learning_rate": 2.6701927444999675e-05, "loss": 0.5496, "step": 14460 }, { "epoch": 1.4040364836017853, "grad_norm": 1.5015720843906, "learning_rate": 2.668570316049062e-05, "loss": 0.535, "step": 14470 }, { "epoch": 1.4050067921599068, "grad_norm": 1.3851757562899687, "learning_rate": 2.6669478875981568e-05, "loss": 0.5628, "step": 14480 }, { "epoch": 1.4059771007180284, "grad_norm": 1.6822047277997916, "learning_rate": 2.6653254591472514e-05, "loss": 0.5605, "step": 14490 }, { "epoch": 1.4069474092761498, "grad_norm": 1.9002135762249894, "learning_rate": 2.663703030696346e-05, "loss": 0.5348, "step": 14500 }, { "epoch": 1.4069474092761498, "eval_loss": 0.6422961950302124, "eval_runtime": 2474.9423, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 14500 }, { "epoch": 1.4079177178342712, "grad_norm": 1.7272883282125664, "learning_rate": 2.6620806022454407e-05, "loss": 0.5518, "step": 14510 }, { "epoch": 1.4088880263923929, "grad_norm": 1.7836182665219666, "learning_rate": 2.6604581737945357e-05, "loss": 0.5195, "step": 14520 }, { "epoch": 1.4098583349505143, "grad_norm": 1.7860780128722327, "learning_rate": 2.6588357453436303e-05, "loss": 0.5061, "step": 14530 }, { "epoch": 1.4108286435086357, "grad_norm": 1.8487543077145125, "learning_rate": 2.657213316892725e-05, "loss": 0.5261, "step": 14540 }, { "epoch": 1.411798952066757, "grad_norm": 1.8960118987299062, "learning_rate": 2.6555908884418196e-05, "loss": 0.523, "step": 14550 }, { "epoch": 1.4127692606248787, "grad_norm": 1.3767948578941858, "learning_rate": 2.6539684599909142e-05, "loss": 0.4654, "step": 14560 }, { "epoch": 1.4137395691830001, "grad_norm": 1.7605168074790472, "learning_rate": 2.652346031540009e-05, "loss": 0.4711, "step": 14570 }, { "epoch": 1.4147098777411218, "grad_norm": 1.424973688569186, "learning_rate": 2.6507236030891042e-05, "loss": 0.5639, "step": 14580 }, { "epoch": 1.4156801862992432, "grad_norm": 1.9215824187230874, "learning_rate": 2.649101174638199e-05, "loss": 0.5137, "step": 14590 }, { "epoch": 1.4166504948573646, "grad_norm": 1.439592415452285, "learning_rate": 2.6474787461872935e-05, "loss": 0.488, "step": 14600 }, { "epoch": 1.417620803415486, "grad_norm": 1.6446340356434774, "learning_rate": 2.645856317736388e-05, "loss": 0.5324, "step": 14610 }, { "epoch": 1.4185911119736077, "grad_norm": 1.9264081435604268, "learning_rate": 2.6442338892854827e-05, "loss": 0.5297, "step": 14620 }, { "epoch": 1.419561420531729, "grad_norm": 1.5948112951701827, "learning_rate": 2.6426114608345777e-05, "loss": 0.5291, "step": 14630 }, { "epoch": 1.4205317290898507, "grad_norm": 1.8654562276618851, "learning_rate": 2.6409890323836724e-05, "loss": 0.5588, "step": 14640 }, { "epoch": 1.4215020376479721, "grad_norm": 1.8410472433514884, "learning_rate": 2.639366603932767e-05, "loss": 0.5167, "step": 14650 }, { "epoch": 1.4224723462060935, "grad_norm": 1.9663683443045321, "learning_rate": 2.6377441754818616e-05, "loss": 0.5166, "step": 14660 }, { "epoch": 1.423442654764215, "grad_norm": 1.77295818801796, "learning_rate": 2.6361217470309563e-05, "loss": 0.5557, "step": 14670 }, { "epoch": 1.4244129633223366, "grad_norm": 1.542816761347417, "learning_rate": 2.634499318580051e-05, "loss": 0.5147, "step": 14680 }, { "epoch": 1.425383271880458, "grad_norm": 2.13234320369828, "learning_rate": 2.6328768901291456e-05, "loss": 0.5421, "step": 14690 }, { "epoch": 1.4263535804385794, "grad_norm": 1.4543036251195376, "learning_rate": 2.6312544616782402e-05, "loss": 0.488, "step": 14700 }, { "epoch": 1.427323888996701, "grad_norm": 1.5267685947307574, "learning_rate": 2.6296320332273348e-05, "loss": 0.5352, "step": 14710 }, { "epoch": 1.4282941975548225, "grad_norm": 2.05420183759504, "learning_rate": 2.6280096047764298e-05, "loss": 0.5489, "step": 14720 }, { "epoch": 1.4292645061129439, "grad_norm": 1.5141763719248076, "learning_rate": 2.6263871763255244e-05, "loss": 0.5627, "step": 14730 }, { "epoch": 1.4302348146710653, "grad_norm": 1.7887916433402153, "learning_rate": 2.624764747874619e-05, "loss": 0.5129, "step": 14740 }, { "epoch": 1.431205123229187, "grad_norm": 1.9817836633125971, "learning_rate": 2.6231423194237137e-05, "loss": 0.5129, "step": 14750 }, { "epoch": 1.4321754317873083, "grad_norm": 2.358539950484514, "learning_rate": 2.6215198909728084e-05, "loss": 0.482, "step": 14760 }, { "epoch": 1.43314574034543, "grad_norm": 1.7444853271955691, "learning_rate": 2.619897462521903e-05, "loss": 0.4948, "step": 14770 }, { "epoch": 1.4341160489035514, "grad_norm": 1.573301988778271, "learning_rate": 2.6182750340709976e-05, "loss": 0.4998, "step": 14780 }, { "epoch": 1.4350863574616728, "grad_norm": 1.878570997397064, "learning_rate": 2.6166526056200923e-05, "loss": 0.489, "step": 14790 }, { "epoch": 1.4360566660197942, "grad_norm": 2.0345633471458444, "learning_rate": 2.615030177169187e-05, "loss": 0.5395, "step": 14800 }, { "epoch": 1.4370269745779158, "grad_norm": 1.7616258348677174, "learning_rate": 2.613407748718282e-05, "loss": 0.5575, "step": 14810 }, { "epoch": 1.4379972831360373, "grad_norm": 2.202827489308336, "learning_rate": 2.6117853202673765e-05, "loss": 0.5378, "step": 14820 }, { "epoch": 1.4389675916941587, "grad_norm": 2.0172801246892496, "learning_rate": 2.610162891816471e-05, "loss": 0.565, "step": 14830 }, { "epoch": 1.4399379002522803, "grad_norm": 1.460733632748616, "learning_rate": 2.6085404633655658e-05, "loss": 0.4866, "step": 14840 }, { "epoch": 1.4409082088104017, "grad_norm": 1.5702235924410088, "learning_rate": 2.6069180349146604e-05, "loss": 0.4814, "step": 14850 }, { "epoch": 1.4418785173685231, "grad_norm": 2.3366982495878013, "learning_rate": 2.605295606463755e-05, "loss": 0.5204, "step": 14860 }, { "epoch": 1.4428488259266445, "grad_norm": 2.0441313559069805, "learning_rate": 2.6036731780128497e-05, "loss": 0.4861, "step": 14870 }, { "epoch": 1.4438191344847662, "grad_norm": 2.2095151787420417, "learning_rate": 2.6020507495619444e-05, "loss": 0.5376, "step": 14880 }, { "epoch": 1.4447894430428876, "grad_norm": 1.9134281253559753, "learning_rate": 2.6004283211110393e-05, "loss": 0.5102, "step": 14890 }, { "epoch": 1.4457597516010092, "grad_norm": 1.7832517565992747, "learning_rate": 2.598805892660134e-05, "loss": 0.5303, "step": 14900 }, { "epoch": 1.4467300601591306, "grad_norm": 1.7558924047130664, "learning_rate": 2.5971834642092286e-05, "loss": 0.4994, "step": 14910 }, { "epoch": 1.447700368717252, "grad_norm": 1.7883915957856114, "learning_rate": 2.5955610357583233e-05, "loss": 0.4677, "step": 14920 }, { "epoch": 1.4486706772753735, "grad_norm": 2.077257189743679, "learning_rate": 2.593938607307418e-05, "loss": 0.515, "step": 14930 }, { "epoch": 1.449640985833495, "grad_norm": 1.9045010104323963, "learning_rate": 2.5923161788565125e-05, "loss": 0.5024, "step": 14940 }, { "epoch": 1.4506112943916165, "grad_norm": 2.106289689638874, "learning_rate": 2.590693750405607e-05, "loss": 0.5242, "step": 14950 }, { "epoch": 1.4515816029497381, "grad_norm": 1.9221754156642648, "learning_rate": 2.5890713219547018e-05, "loss": 0.4712, "step": 14960 }, { "epoch": 1.4525519115078596, "grad_norm": 1.8046920974227167, "learning_rate": 2.5874488935037964e-05, "loss": 0.5017, "step": 14970 }, { "epoch": 1.453522220065981, "grad_norm": 1.855646189779827, "learning_rate": 2.5858264650528914e-05, "loss": 0.5146, "step": 14980 }, { "epoch": 1.4544925286241024, "grad_norm": 1.8308672850602437, "learning_rate": 2.584204036601986e-05, "loss": 0.5293, "step": 14990 }, { "epoch": 1.455462837182224, "grad_norm": 1.6235231403062087, "learning_rate": 2.5825816081510807e-05, "loss": 0.4968, "step": 15000 }, { "epoch": 1.455462837182224, "eval_loss": 0.6395026445388794, "eval_runtime": 2472.4459, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 15000 }, { "epoch": 1.4564331457403454, "grad_norm": 1.9674866651938567, "learning_rate": 2.5809591797001753e-05, "loss": 0.5402, "step": 15010 }, { "epoch": 1.4574034542984668, "grad_norm": 1.7667679072452773, "learning_rate": 2.57933675124927e-05, "loss": 0.5598, "step": 15020 }, { "epoch": 1.4583737628565885, "grad_norm": 1.6236668551346622, "learning_rate": 2.5777143227983646e-05, "loss": 0.4819, "step": 15030 }, { "epoch": 1.45934407141471, "grad_norm": 1.5162521396018838, "learning_rate": 2.5760918943474592e-05, "loss": 0.5609, "step": 15040 }, { "epoch": 1.4603143799728313, "grad_norm": 2.248488105603888, "learning_rate": 2.574469465896554e-05, "loss": 0.4987, "step": 15050 }, { "epoch": 1.4612846885309527, "grad_norm": 1.5876501247710053, "learning_rate": 2.572847037445649e-05, "loss": 0.4794, "step": 15060 }, { "epoch": 1.4622549970890744, "grad_norm": 1.9484672186526921, "learning_rate": 2.5712246089947435e-05, "loss": 0.5115, "step": 15070 }, { "epoch": 1.4632253056471958, "grad_norm": 1.9931906967691606, "learning_rate": 2.569602180543838e-05, "loss": 0.4848, "step": 15080 }, { "epoch": 1.4641956142053174, "grad_norm": 1.9798564517490436, "learning_rate": 2.5679797520929328e-05, "loss": 0.5136, "step": 15090 }, { "epoch": 1.4651659227634388, "grad_norm": 1.7543081259876927, "learning_rate": 2.5663573236420274e-05, "loss": 0.5179, "step": 15100 }, { "epoch": 1.4661362313215602, "grad_norm": 1.797610941181892, "learning_rate": 2.564734895191122e-05, "loss": 0.5134, "step": 15110 }, { "epoch": 1.4671065398796816, "grad_norm": 1.6254067495518276, "learning_rate": 2.5631124667402167e-05, "loss": 0.497, "step": 15120 }, { "epoch": 1.4680768484378033, "grad_norm": 2.0916697354749743, "learning_rate": 2.5614900382893113e-05, "loss": 0.5035, "step": 15130 }, { "epoch": 1.4690471569959247, "grad_norm": 1.7408478320862355, "learning_rate": 2.559867609838406e-05, "loss": 0.5165, "step": 15140 }, { "epoch": 1.470017465554046, "grad_norm": 1.6971971097300078, "learning_rate": 2.558245181387501e-05, "loss": 0.5416, "step": 15150 }, { "epoch": 1.4709877741121677, "grad_norm": 1.7615798102754638, "learning_rate": 2.5566227529365956e-05, "loss": 0.5028, "step": 15160 }, { "epoch": 1.4719580826702892, "grad_norm": 2.126034120697344, "learning_rate": 2.5550003244856902e-05, "loss": 0.4983, "step": 15170 }, { "epoch": 1.4729283912284106, "grad_norm": 1.7930301180931063, "learning_rate": 2.553377896034785e-05, "loss": 0.4836, "step": 15180 }, { "epoch": 1.473898699786532, "grad_norm": 1.945479896374108, "learning_rate": 2.5517554675838795e-05, "loss": 0.4874, "step": 15190 }, { "epoch": 1.4748690083446536, "grad_norm": 1.659537976782885, "learning_rate": 2.550133039132974e-05, "loss": 0.485, "step": 15200 }, { "epoch": 1.475839316902775, "grad_norm": 1.7852824594767274, "learning_rate": 2.5485106106820688e-05, "loss": 0.5314, "step": 15210 }, { "epoch": 1.4768096254608967, "grad_norm": 1.7757412059616349, "learning_rate": 2.5468881822311634e-05, "loss": 0.4906, "step": 15220 }, { "epoch": 1.477779934019018, "grad_norm": 1.7940563485432668, "learning_rate": 2.5452657537802584e-05, "loss": 0.4973, "step": 15230 }, { "epoch": 1.4787502425771395, "grad_norm": 1.8223131394278327, "learning_rate": 2.543643325329353e-05, "loss": 0.5122, "step": 15240 }, { "epoch": 1.479720551135261, "grad_norm": 1.9217304666232693, "learning_rate": 2.5420208968784477e-05, "loss": 0.5497, "step": 15250 }, { "epoch": 1.4806908596933825, "grad_norm": 1.6084194486566938, "learning_rate": 2.5403984684275423e-05, "loss": 0.4971, "step": 15260 }, { "epoch": 1.481661168251504, "grad_norm": 2.1056710345080827, "learning_rate": 2.538776039976637e-05, "loss": 0.5379, "step": 15270 }, { "epoch": 1.4826314768096256, "grad_norm": 2.2545586744739015, "learning_rate": 2.5371536115257316e-05, "loss": 0.4239, "step": 15280 }, { "epoch": 1.483601785367747, "grad_norm": 2.0015642103213063, "learning_rate": 2.5355311830748262e-05, "loss": 0.5283, "step": 15290 }, { "epoch": 1.4845720939258684, "grad_norm": 2.0206628423435387, "learning_rate": 2.533908754623921e-05, "loss": 0.4533, "step": 15300 }, { "epoch": 1.4855424024839898, "grad_norm": 2.0392064010865263, "learning_rate": 2.5322863261730155e-05, "loss": 0.516, "step": 15310 }, { "epoch": 1.4865127110421115, "grad_norm": 1.6397938987534753, "learning_rate": 2.5306638977221105e-05, "loss": 0.4915, "step": 15320 }, { "epoch": 1.4874830196002329, "grad_norm": 2.0519457829568615, "learning_rate": 2.529041469271205e-05, "loss": 0.4938, "step": 15330 }, { "epoch": 1.4884533281583543, "grad_norm": 1.883771979267065, "learning_rate": 2.5274190408202998e-05, "loss": 0.5454, "step": 15340 }, { "epoch": 1.489423636716476, "grad_norm": 1.8963017429804823, "learning_rate": 2.5257966123693944e-05, "loss": 0.4733, "step": 15350 }, { "epoch": 1.4903939452745973, "grad_norm": 2.005144587549119, "learning_rate": 2.5241741839184897e-05, "loss": 0.4967, "step": 15360 }, { "epoch": 1.4913642538327188, "grad_norm": 2.151826141060965, "learning_rate": 2.5225517554675843e-05, "loss": 0.5466, "step": 15370 }, { "epoch": 1.4923345623908402, "grad_norm": 1.6751197267270117, "learning_rate": 2.520929327016679e-05, "loss": 0.5456, "step": 15380 }, { "epoch": 1.4933048709489618, "grad_norm": 1.9009493553059222, "learning_rate": 2.5193068985657736e-05, "loss": 0.483, "step": 15390 }, { "epoch": 1.4942751795070832, "grad_norm": 2.1457921969425757, "learning_rate": 2.5176844701148683e-05, "loss": 0.5458, "step": 15400 }, { "epoch": 1.4952454880652049, "grad_norm": 1.9369152546010822, "learning_rate": 2.516062041663963e-05, "loss": 0.5477, "step": 15410 }, { "epoch": 1.4962157966233263, "grad_norm": 1.9226350314538543, "learning_rate": 2.5144396132130575e-05, "loss": 0.5469, "step": 15420 }, { "epoch": 1.4971861051814477, "grad_norm": 1.768970891771466, "learning_rate": 2.5128171847621522e-05, "loss": 0.4789, "step": 15430 }, { "epoch": 1.498156413739569, "grad_norm": 1.6324753131013463, "learning_rate": 2.511194756311247e-05, "loss": 0.4984, "step": 15440 }, { "epoch": 1.4991267222976907, "grad_norm": 1.5500314116241656, "learning_rate": 2.5095723278603418e-05, "loss": 0.5313, "step": 15450 }, { "epoch": 1.5000970308558121, "grad_norm": 1.8222773979858036, "learning_rate": 2.5079498994094364e-05, "loss": 0.4935, "step": 15460 }, { "epoch": 1.5010673394139338, "grad_norm": 1.6457715681286798, "learning_rate": 2.506327470958531e-05, "loss": 0.4739, "step": 15470 }, { "epoch": 1.5020376479720552, "grad_norm": 1.6810719922194195, "learning_rate": 2.5047050425076257e-05, "loss": 0.478, "step": 15480 }, { "epoch": 1.5030079565301766, "grad_norm": 1.9891968136548206, "learning_rate": 2.5030826140567203e-05, "loss": 0.5365, "step": 15490 }, { "epoch": 1.503978265088298, "grad_norm": 1.836679974736217, "learning_rate": 2.501460185605815e-05, "loss": 0.5497, "step": 15500 }, { "epoch": 1.503978265088298, "eval_loss": 0.6403182148933411, "eval_runtime": 2470.3613, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 15500 }, { "epoch": 1.5049485736464194, "grad_norm": 1.9808212956686209, "learning_rate": 2.4998377571549096e-05, "loss": 0.5195, "step": 15510 }, { "epoch": 1.505918882204541, "grad_norm": 1.696799638403568, "learning_rate": 2.4982153287040043e-05, "loss": 0.481, "step": 15520 }, { "epoch": 1.5068891907626625, "grad_norm": 1.559487018818546, "learning_rate": 2.496592900253099e-05, "loss": 0.4849, "step": 15530 }, { "epoch": 1.5078594993207841, "grad_norm": 2.0106819741019635, "learning_rate": 2.4949704718021935e-05, "loss": 0.5068, "step": 15540 }, { "epoch": 1.5088298078789055, "grad_norm": 1.554558550656832, "learning_rate": 2.4933480433512885e-05, "loss": 0.5542, "step": 15550 }, { "epoch": 1.509800116437027, "grad_norm": 2.1570334538409597, "learning_rate": 2.491725614900383e-05, "loss": 0.5537, "step": 15560 }, { "epoch": 1.5107704249951484, "grad_norm": 2.082418119992958, "learning_rate": 2.4901031864494778e-05, "loss": 0.4495, "step": 15570 }, { "epoch": 1.51174073355327, "grad_norm": 1.7903747855746062, "learning_rate": 2.4884807579985724e-05, "loss": 0.4755, "step": 15580 }, { "epoch": 1.5127110421113914, "grad_norm": 1.8907675365997436, "learning_rate": 2.486858329547667e-05, "loss": 0.5377, "step": 15590 }, { "epoch": 1.513681350669513, "grad_norm": 1.6823616833089232, "learning_rate": 2.4852359010967617e-05, "loss": 0.4747, "step": 15600 }, { "epoch": 1.5146516592276345, "grad_norm": 1.8263050453987495, "learning_rate": 2.4836134726458567e-05, "loss": 0.4962, "step": 15610 }, { "epoch": 1.5156219677857559, "grad_norm": 1.855857638521949, "learning_rate": 2.4819910441949513e-05, "loss": 0.5306, "step": 15620 }, { "epoch": 1.5165922763438773, "grad_norm": 1.7196056977172254, "learning_rate": 2.480368615744046e-05, "loss": 0.507, "step": 15630 }, { "epoch": 1.5175625849019987, "grad_norm": 2.2274627334622354, "learning_rate": 2.4787461872931406e-05, "loss": 0.4741, "step": 15640 }, { "epoch": 1.5185328934601203, "grad_norm": 1.7506693514893716, "learning_rate": 2.4771237588422352e-05, "loss": 0.4512, "step": 15650 }, { "epoch": 1.519503202018242, "grad_norm": 1.5102515980183955, "learning_rate": 2.47550133039133e-05, "loss": 0.5656, "step": 15660 }, { "epoch": 1.5204735105763634, "grad_norm": 1.7638756809733325, "learning_rate": 2.4738789019404245e-05, "loss": 0.5209, "step": 15670 }, { "epoch": 1.5214438191344848, "grad_norm": 1.67804925991046, "learning_rate": 2.472256473489519e-05, "loss": 0.5442, "step": 15680 }, { "epoch": 1.5224141276926062, "grad_norm": 1.6503421070503612, "learning_rate": 2.470634045038614e-05, "loss": 0.5516, "step": 15690 }, { "epoch": 1.5233844362507276, "grad_norm": 1.7388522657721959, "learning_rate": 2.4690116165877088e-05, "loss": 0.4894, "step": 15700 }, { "epoch": 1.5243547448088492, "grad_norm": 1.955414988998346, "learning_rate": 2.4673891881368034e-05, "loss": 0.5493, "step": 15710 }, { "epoch": 1.5253250533669707, "grad_norm": 2.1089020897683324, "learning_rate": 2.465766759685898e-05, "loss": 0.5173, "step": 15720 }, { "epoch": 1.5262953619250923, "grad_norm": 2.1627868553616416, "learning_rate": 2.4641443312349927e-05, "loss": 0.5165, "step": 15730 }, { "epoch": 1.5272656704832137, "grad_norm": 1.9351405181934673, "learning_rate": 2.4625219027840873e-05, "loss": 0.4873, "step": 15740 }, { "epoch": 1.5282359790413351, "grad_norm": 1.8488349390401109, "learning_rate": 2.460899474333182e-05, "loss": 0.5121, "step": 15750 }, { "epoch": 1.5292062875994565, "grad_norm": 1.9814673588646732, "learning_rate": 2.4592770458822766e-05, "loss": 0.4942, "step": 15760 }, { "epoch": 1.530176596157578, "grad_norm": 1.8985936905786858, "learning_rate": 2.4576546174313712e-05, "loss": 0.5227, "step": 15770 }, { "epoch": 1.5311469047156996, "grad_norm": 2.144939705295627, "learning_rate": 2.4560321889804662e-05, "loss": 0.5584, "step": 15780 }, { "epoch": 1.5321172132738212, "grad_norm": 2.196125539672085, "learning_rate": 2.454409760529561e-05, "loss": 0.4592, "step": 15790 }, { "epoch": 1.5330875218319426, "grad_norm": 1.5896290702039733, "learning_rate": 2.4527873320786555e-05, "loss": 0.5183, "step": 15800 }, { "epoch": 1.534057830390064, "grad_norm": 1.3926787086096604, "learning_rate": 2.45116490362775e-05, "loss": 0.4929, "step": 15810 }, { "epoch": 1.5350281389481855, "grad_norm": 1.8947619224673768, "learning_rate": 2.4495424751768448e-05, "loss": 0.487, "step": 15820 }, { "epoch": 1.5359984475063069, "grad_norm": 1.9366715293465946, "learning_rate": 2.4479200467259394e-05, "loss": 0.5126, "step": 15830 }, { "epoch": 1.5369687560644285, "grad_norm": 1.8854258468503662, "learning_rate": 2.446297618275034e-05, "loss": 0.4732, "step": 15840 }, { "epoch": 1.53793906462255, "grad_norm": 1.7897154496692322, "learning_rate": 2.4446751898241287e-05, "loss": 0.4869, "step": 15850 }, { "epoch": 1.5389093731806716, "grad_norm": 1.9691769621461568, "learning_rate": 2.4430527613732233e-05, "loss": 0.5204, "step": 15860 }, { "epoch": 1.539879681738793, "grad_norm": 1.5159729039901195, "learning_rate": 2.4414303329223183e-05, "loss": 0.5067, "step": 15870 }, { "epoch": 1.5408499902969144, "grad_norm": 2.0085743327171364, "learning_rate": 2.439807904471413e-05, "loss": 0.5855, "step": 15880 }, { "epoch": 1.5418202988550358, "grad_norm": 1.808826518921202, "learning_rate": 2.4381854760205076e-05, "loss": 0.511, "step": 15890 }, { "epoch": 1.5427906074131574, "grad_norm": 2.144838816182226, "learning_rate": 2.4365630475696022e-05, "loss": 0.5151, "step": 15900 }, { "epoch": 1.5437609159712788, "grad_norm": 1.9134282869959454, "learning_rate": 2.434940619118697e-05, "loss": 0.547, "step": 15910 }, { "epoch": 1.5447312245294005, "grad_norm": 1.4718585956659067, "learning_rate": 2.4333181906677915e-05, "loss": 0.5035, "step": 15920 }, { "epoch": 1.545701533087522, "grad_norm": 1.9846910792449015, "learning_rate": 2.431695762216886e-05, "loss": 0.4603, "step": 15930 }, { "epoch": 1.5466718416456433, "grad_norm": 2.237689406521008, "learning_rate": 2.430073333765981e-05, "loss": 0.4835, "step": 15940 }, { "epoch": 1.5476421502037647, "grad_norm": 1.8521877806585876, "learning_rate": 2.4284509053150757e-05, "loss": 0.486, "step": 15950 }, { "epoch": 1.5486124587618861, "grad_norm": 1.6410601205984716, "learning_rate": 2.4268284768641704e-05, "loss": 0.5362, "step": 15960 }, { "epoch": 1.5495827673200078, "grad_norm": 2.0223928767474524, "learning_rate": 2.4252060484132653e-05, "loss": 0.5954, "step": 15970 }, { "epoch": 1.5505530758781294, "grad_norm": 2.531432809831189, "learning_rate": 2.42358361996236e-05, "loss": 0.5618, "step": 15980 }, { "epoch": 1.5515233844362508, "grad_norm": 2.0740258540735788, "learning_rate": 2.4219611915114546e-05, "loss": 0.4733, "step": 15990 }, { "epoch": 1.5524936929943722, "grad_norm": 1.7587497399634056, "learning_rate": 2.4203387630605493e-05, "loss": 0.5099, "step": 16000 }, { "epoch": 1.5524936929943722, "eval_loss": 0.635991096496582, "eval_runtime": 2468.0382, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 16000 }, { "epoch": 1.5534640015524936, "grad_norm": 1.670336846718715, "learning_rate": 2.418716334609644e-05, "loss": 0.4553, "step": 16010 }, { "epoch": 1.554434310110615, "grad_norm": 2.0177796015243237, "learning_rate": 2.4170939061587385e-05, "loss": 0.5115, "step": 16020 }, { "epoch": 1.5554046186687367, "grad_norm": 1.8858883624862002, "learning_rate": 2.4154714777078332e-05, "loss": 0.5172, "step": 16030 }, { "epoch": 1.556374927226858, "grad_norm": 2.0359679981766647, "learning_rate": 2.4138490492569278e-05, "loss": 0.4589, "step": 16040 }, { "epoch": 1.5573452357849797, "grad_norm": 1.2444792958823963, "learning_rate": 2.4122266208060228e-05, "loss": 0.5134, "step": 16050 }, { "epoch": 1.5583155443431012, "grad_norm": 1.5062153000531946, "learning_rate": 2.4106041923551174e-05, "loss": 0.4869, "step": 16060 }, { "epoch": 1.5592858529012226, "grad_norm": 2.2139374575219364, "learning_rate": 2.408981763904212e-05, "loss": 0.4811, "step": 16070 }, { "epoch": 1.560256161459344, "grad_norm": 1.7542756535220294, "learning_rate": 2.4073593354533067e-05, "loss": 0.4895, "step": 16080 }, { "epoch": 1.5612264700174654, "grad_norm": 1.9663056975292839, "learning_rate": 2.4057369070024013e-05, "loss": 0.5046, "step": 16090 }, { "epoch": 1.562196778575587, "grad_norm": 1.760446721510282, "learning_rate": 2.404114478551496e-05, "loss": 0.495, "step": 16100 }, { "epoch": 1.5631670871337087, "grad_norm": 1.6132518828061082, "learning_rate": 2.4024920501005906e-05, "loss": 0.4985, "step": 16110 }, { "epoch": 1.56413739569183, "grad_norm": 1.7640461059330637, "learning_rate": 2.4008696216496853e-05, "loss": 0.5186, "step": 16120 }, { "epoch": 1.5651077042499515, "grad_norm": 1.769814216957158, "learning_rate": 2.39924719319878e-05, "loss": 0.5413, "step": 16130 }, { "epoch": 1.566078012808073, "grad_norm": 2.293241964432802, "learning_rate": 2.397624764747875e-05, "loss": 0.4809, "step": 16140 }, { "epoch": 1.5670483213661943, "grad_norm": 1.8759559371438301, "learning_rate": 2.3960023362969695e-05, "loss": 0.5262, "step": 16150 }, { "epoch": 1.568018629924316, "grad_norm": 2.3087090615314114, "learning_rate": 2.394379907846064e-05, "loss": 0.4841, "step": 16160 }, { "epoch": 1.5689889384824374, "grad_norm": 1.6108810314362396, "learning_rate": 2.3927574793951588e-05, "loss": 0.5188, "step": 16170 }, { "epoch": 1.569959247040559, "grad_norm": 2.241869048530712, "learning_rate": 2.3911350509442534e-05, "loss": 0.5005, "step": 16180 }, { "epoch": 1.5709295555986804, "grad_norm": 2.1127428890150215, "learning_rate": 2.389512622493348e-05, "loss": 0.5063, "step": 16190 }, { "epoch": 1.5718998641568018, "grad_norm": 1.7046555426171013, "learning_rate": 2.3878901940424427e-05, "loss": 0.5154, "step": 16200 }, { "epoch": 1.5728701727149232, "grad_norm": 1.6974188480153705, "learning_rate": 2.3862677655915373e-05, "loss": 0.4775, "step": 16210 }, { "epoch": 1.5738404812730449, "grad_norm": 1.9773772891509165, "learning_rate": 2.384645337140632e-05, "loss": 0.489, "step": 16220 }, { "epoch": 1.5748107898311663, "grad_norm": 2.3323506244488645, "learning_rate": 2.383022908689727e-05, "loss": 0.5252, "step": 16230 }, { "epoch": 1.575781098389288, "grad_norm": 1.4580240462073564, "learning_rate": 2.3814004802388216e-05, "loss": 0.4809, "step": 16240 }, { "epoch": 1.5767514069474093, "grad_norm": 1.9799244985970428, "learning_rate": 2.3797780517879162e-05, "loss": 0.4938, "step": 16250 }, { "epoch": 1.5777217155055308, "grad_norm": 2.2967773637028315, "learning_rate": 2.378155623337011e-05, "loss": 0.48, "step": 16260 }, { "epoch": 1.5786920240636522, "grad_norm": 1.6715177397086511, "learning_rate": 2.3765331948861055e-05, "loss": 0.5112, "step": 16270 }, { "epoch": 1.5796623326217736, "grad_norm": 1.7543224867437088, "learning_rate": 2.3749107664352e-05, "loss": 0.5355, "step": 16280 }, { "epoch": 1.5806326411798952, "grad_norm": 1.7059823329477148, "learning_rate": 2.3732883379842948e-05, "loss": 0.4885, "step": 16290 }, { "epoch": 1.5816029497380169, "grad_norm": 1.808906067511985, "learning_rate": 2.3716659095333894e-05, "loss": 0.4883, "step": 16300 }, { "epoch": 1.5825732582961383, "grad_norm": 1.456280506138187, "learning_rate": 2.3700434810824844e-05, "loss": 0.4986, "step": 16310 }, { "epoch": 1.5835435668542597, "grad_norm": 1.7844142401171645, "learning_rate": 2.368421052631579e-05, "loss": 0.5726, "step": 16320 }, { "epoch": 1.584513875412381, "grad_norm": 1.8511812344459693, "learning_rate": 2.3667986241806737e-05, "loss": 0.5418, "step": 16330 }, { "epoch": 1.5854841839705025, "grad_norm": 1.6886781249735945, "learning_rate": 2.3651761957297687e-05, "loss": 0.4871, "step": 16340 }, { "epoch": 1.5864544925286241, "grad_norm": 1.6386080299672316, "learning_rate": 2.3635537672788633e-05, "loss": 0.5502, "step": 16350 }, { "epoch": 1.5874248010867456, "grad_norm": 1.6152930914041828, "learning_rate": 2.361931338827958e-05, "loss": 0.4602, "step": 16360 }, { "epoch": 1.5883951096448672, "grad_norm": 1.9896290423806842, "learning_rate": 2.3603089103770526e-05, "loss": 0.5517, "step": 16370 }, { "epoch": 1.5893654182029886, "grad_norm": 1.785358677833909, "learning_rate": 2.3586864819261472e-05, "loss": 0.5108, "step": 16380 }, { "epoch": 1.59033572676111, "grad_norm": 1.8919452155428813, "learning_rate": 2.357064053475242e-05, "loss": 0.5586, "step": 16390 }, { "epoch": 1.5913060353192314, "grad_norm": 2.1765937296152376, "learning_rate": 2.3554416250243365e-05, "loss": 0.4978, "step": 16400 }, { "epoch": 1.5922763438773528, "grad_norm": 1.9463613536174769, "learning_rate": 2.3538191965734315e-05, "loss": 0.534, "step": 16410 }, { "epoch": 1.5932466524354745, "grad_norm": 1.483428953982187, "learning_rate": 2.352196768122526e-05, "loss": 0.4481, "step": 16420 }, { "epoch": 1.594216960993596, "grad_norm": 1.8946130151442728, "learning_rate": 2.3505743396716207e-05, "loss": 0.4499, "step": 16430 }, { "epoch": 1.5951872695517175, "grad_norm": 1.496172632655443, "learning_rate": 2.3489519112207154e-05, "loss": 0.4719, "step": 16440 }, { "epoch": 1.596157578109839, "grad_norm": 1.5582678654830617, "learning_rate": 2.34732948276981e-05, "loss": 0.5023, "step": 16450 }, { "epoch": 1.5971278866679604, "grad_norm": 1.9101490466966036, "learning_rate": 2.3457070543189047e-05, "loss": 0.4442, "step": 16460 }, { "epoch": 1.5980981952260818, "grad_norm": 2.5712850986928757, "learning_rate": 2.3440846258679993e-05, "loss": 0.5008, "step": 16470 }, { "epoch": 1.5990685037842034, "grad_norm": 2.1177587573552805, "learning_rate": 2.342462197417094e-05, "loss": 0.5014, "step": 16480 }, { "epoch": 1.6000388123423248, "grad_norm": 2.4990786716780455, "learning_rate": 2.3408397689661886e-05, "loss": 0.4583, "step": 16490 }, { "epoch": 1.6010091209004464, "grad_norm": 1.91427095412827, "learning_rate": 2.3392173405152835e-05, "loss": 0.5037, "step": 16500 }, { "epoch": 1.6010091209004464, "eval_loss": 0.6359612345695496, "eval_runtime": 2467.4852, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 16500 }, { "epoch": 1.6019794294585679, "grad_norm": 1.8370649227416649, "learning_rate": 2.3375949120643782e-05, "loss": 0.5399, "step": 16510 }, { "epoch": 1.6029497380166893, "grad_norm": 1.6932084838539212, "learning_rate": 2.3359724836134728e-05, "loss": 0.5212, "step": 16520 }, { "epoch": 1.6039200465748107, "grad_norm": 1.68790954641985, "learning_rate": 2.3343500551625675e-05, "loss": 0.4837, "step": 16530 }, { "epoch": 1.6048903551329323, "grad_norm": 1.5773573029174093, "learning_rate": 2.332727626711662e-05, "loss": 0.5125, "step": 16540 }, { "epoch": 1.6058606636910537, "grad_norm": 1.7640635185794997, "learning_rate": 2.3311051982607567e-05, "loss": 0.5013, "step": 16550 }, { "epoch": 1.6068309722491754, "grad_norm": 1.7477056146726457, "learning_rate": 2.3294827698098514e-05, "loss": 0.5095, "step": 16560 }, { "epoch": 1.6078012808072968, "grad_norm": 1.0733428522302542, "learning_rate": 2.327860341358946e-05, "loss": 0.5, "step": 16570 }, { "epoch": 1.6087715893654182, "grad_norm": 1.9564292094193194, "learning_rate": 2.326237912908041e-05, "loss": 0.5058, "step": 16580 }, { "epoch": 1.6097418979235396, "grad_norm": 2.1045720941333466, "learning_rate": 2.3246154844571356e-05, "loss": 0.5101, "step": 16590 }, { "epoch": 1.610712206481661, "grad_norm": 2.171187966701804, "learning_rate": 2.3229930560062303e-05, "loss": 0.5069, "step": 16600 }, { "epoch": 1.6116825150397827, "grad_norm": 1.9610313890988438, "learning_rate": 2.321370627555325e-05, "loss": 0.4904, "step": 16610 }, { "epoch": 1.6126528235979043, "grad_norm": 1.941252888287077, "learning_rate": 2.3197481991044195e-05, "loss": 0.4865, "step": 16620 }, { "epoch": 1.6136231321560257, "grad_norm": 1.897218836962928, "learning_rate": 2.3181257706535142e-05, "loss": 0.4966, "step": 16630 }, { "epoch": 1.6145934407141471, "grad_norm": 2.201820668610594, "learning_rate": 2.3165033422026088e-05, "loss": 0.4895, "step": 16640 }, { "epoch": 1.6155637492722685, "grad_norm": 1.5720192215380684, "learning_rate": 2.3148809137517035e-05, "loss": 0.4918, "step": 16650 }, { "epoch": 1.61653405783039, "grad_norm": 2.227704713223705, "learning_rate": 2.313258485300798e-05, "loss": 0.5251, "step": 16660 }, { "epoch": 1.6175043663885116, "grad_norm": 2.0170271312798707, "learning_rate": 2.311636056849893e-05, "loss": 0.4965, "step": 16670 }, { "epoch": 1.618474674946633, "grad_norm": 1.9688156858912216, "learning_rate": 2.3100136283989877e-05, "loss": 0.4685, "step": 16680 }, { "epoch": 1.6194449835047546, "grad_norm": 2.0958524562959675, "learning_rate": 2.3083911999480824e-05, "loss": 0.4884, "step": 16690 }, { "epoch": 1.620415292062876, "grad_norm": 2.0973339157843447, "learning_rate": 2.306768771497177e-05, "loss": 0.4394, "step": 16700 }, { "epoch": 1.6213856006209975, "grad_norm": 1.5852203386480506, "learning_rate": 2.3051463430462716e-05, "loss": 0.5068, "step": 16710 }, { "epoch": 1.6223559091791189, "grad_norm": 1.907182215766834, "learning_rate": 2.3035239145953663e-05, "loss": 0.461, "step": 16720 }, { "epoch": 1.6233262177372403, "grad_norm": 1.9903511844679715, "learning_rate": 2.3019014861444612e-05, "loss": 0.4861, "step": 16730 }, { "epoch": 1.624296526295362, "grad_norm": 1.8734845445192632, "learning_rate": 2.300279057693556e-05, "loss": 0.5203, "step": 16740 }, { "epoch": 1.6252668348534836, "grad_norm": 1.4830887784438085, "learning_rate": 2.2986566292426505e-05, "loss": 0.5504, "step": 16750 }, { "epoch": 1.626237143411605, "grad_norm": 1.9852817046238187, "learning_rate": 2.297034200791745e-05, "loss": 0.5727, "step": 16760 }, { "epoch": 1.6272074519697264, "grad_norm": 1.9574779506866022, "learning_rate": 2.29541177234084e-05, "loss": 0.4932, "step": 16770 }, { "epoch": 1.6281777605278478, "grad_norm": 1.8045639100087034, "learning_rate": 2.2937893438899348e-05, "loss": 0.4874, "step": 16780 }, { "epoch": 1.6291480690859692, "grad_norm": 1.9467881794111201, "learning_rate": 2.2921669154390294e-05, "loss": 0.4231, "step": 16790 }, { "epoch": 1.6301183776440908, "grad_norm": 2.115958262042766, "learning_rate": 2.290544486988124e-05, "loss": 0.5085, "step": 16800 }, { "epoch": 1.6310886862022123, "grad_norm": 1.9242055352600025, "learning_rate": 2.2889220585372187e-05, "loss": 0.4961, "step": 16810 }, { "epoch": 1.632058994760334, "grad_norm": 1.7473969264432643, "learning_rate": 2.2872996300863133e-05, "loss": 0.4896, "step": 16820 }, { "epoch": 1.6330293033184553, "grad_norm": 1.9086620383723776, "learning_rate": 2.285677201635408e-05, "loss": 0.5793, "step": 16830 }, { "epoch": 1.6339996118765767, "grad_norm": 1.8179786231819581, "learning_rate": 2.2840547731845026e-05, "loss": 0.47, "step": 16840 }, { "epoch": 1.6349699204346981, "grad_norm": 2.3742959935176993, "learning_rate": 2.2824323447335972e-05, "loss": 0.4601, "step": 16850 }, { "epoch": 1.6359402289928198, "grad_norm": 1.5831090068828182, "learning_rate": 2.2808099162826922e-05, "loss": 0.527, "step": 16860 }, { "epoch": 1.6369105375509412, "grad_norm": 2.429466466556868, "learning_rate": 2.279187487831787e-05, "loss": 0.5041, "step": 16870 }, { "epoch": 1.6378808461090628, "grad_norm": 2.0666827078950676, "learning_rate": 2.2775650593808815e-05, "loss": 0.5093, "step": 16880 }, { "epoch": 1.6388511546671842, "grad_norm": 1.7376049636619024, "learning_rate": 2.275942630929976e-05, "loss": 0.4899, "step": 16890 }, { "epoch": 1.6398214632253056, "grad_norm": 1.9743952968077716, "learning_rate": 2.2743202024790708e-05, "loss": 0.4483, "step": 16900 }, { "epoch": 1.640791771783427, "grad_norm": 1.7020982614990035, "learning_rate": 2.2726977740281654e-05, "loss": 0.4746, "step": 16910 }, { "epoch": 1.6417620803415485, "grad_norm": 1.5288620918579647, "learning_rate": 2.27107534557726e-05, "loss": 0.5386, "step": 16920 }, { "epoch": 1.64273238889967, "grad_norm": 2.2418459177194725, "learning_rate": 2.2694529171263547e-05, "loss": 0.5221, "step": 16930 }, { "epoch": 1.6437026974577917, "grad_norm": 1.7801442444217204, "learning_rate": 2.2678304886754497e-05, "loss": 0.502, "step": 16940 }, { "epoch": 1.6446730060159132, "grad_norm": 2.750428317407271, "learning_rate": 2.2662080602245443e-05, "loss": 0.4947, "step": 16950 }, { "epoch": 1.6456433145740346, "grad_norm": 1.5310655899660852, "learning_rate": 2.264585631773639e-05, "loss": 0.5228, "step": 16960 }, { "epoch": 1.646613623132156, "grad_norm": 1.7325660800102027, "learning_rate": 2.2629632033227336e-05, "loss": 0.4724, "step": 16970 }, { "epoch": 1.6475839316902774, "grad_norm": 3.028000821794418, "learning_rate": 2.2613407748718282e-05, "loss": 0.5444, "step": 16980 }, { "epoch": 1.648554240248399, "grad_norm": 1.8847091529095503, "learning_rate": 2.259718346420923e-05, "loss": 0.5095, "step": 16990 }, { "epoch": 1.6495245488065204, "grad_norm": 1.584479445600283, "learning_rate": 2.2580959179700175e-05, "loss": 0.5471, "step": 17000 }, { "epoch": 1.6495245488065204, "eval_loss": 0.6354050636291504, "eval_runtime": 2468.5352, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 17000 }, { "epoch": 1.650494857364642, "grad_norm": 1.668904824652418, "learning_rate": 2.256473489519112e-05, "loss": 0.4838, "step": 17010 }, { "epoch": 1.6514651659227635, "grad_norm": 1.7782439469811153, "learning_rate": 2.2548510610682068e-05, "loss": 0.5225, "step": 17020 }, { "epoch": 1.652435474480885, "grad_norm": 2.1175538832822576, "learning_rate": 2.2532286326173017e-05, "loss": 0.5438, "step": 17030 }, { "epoch": 1.6534057830390063, "grad_norm": 1.9031650594744414, "learning_rate": 2.2516062041663964e-05, "loss": 0.4456, "step": 17040 }, { "epoch": 1.6543760915971277, "grad_norm": 1.8418596497226953, "learning_rate": 2.249983775715491e-05, "loss": 0.5148, "step": 17050 }, { "epoch": 1.6553464001552494, "grad_norm": 2.001058155829245, "learning_rate": 2.2483613472645857e-05, "loss": 0.5381, "step": 17060 }, { "epoch": 1.656316708713371, "grad_norm": 1.6070553409553883, "learning_rate": 2.2467389188136803e-05, "loss": 0.5199, "step": 17070 }, { "epoch": 1.6572870172714924, "grad_norm": 1.8977480591408646, "learning_rate": 2.245116490362775e-05, "loss": 0.5132, "step": 17080 }, { "epoch": 1.6582573258296138, "grad_norm": 1.8011894993916608, "learning_rate": 2.2434940619118696e-05, "loss": 0.5448, "step": 17090 }, { "epoch": 1.6592276343877352, "grad_norm": 2.0284255015217565, "learning_rate": 2.2418716334609642e-05, "loss": 0.4717, "step": 17100 }, { "epoch": 1.6601979429458567, "grad_norm": 1.885999888526123, "learning_rate": 2.2402492050100592e-05, "loss": 0.5183, "step": 17110 }, { "epoch": 1.6611682515039783, "grad_norm": 1.9056970812089797, "learning_rate": 2.2386267765591538e-05, "loss": 0.4393, "step": 17120 }, { "epoch": 1.6621385600620997, "grad_norm": 2.267841170403769, "learning_rate": 2.2370043481082488e-05, "loss": 0.4882, "step": 17130 }, { "epoch": 1.6631088686202213, "grad_norm": 1.5488822989395437, "learning_rate": 2.2353819196573434e-05, "loss": 0.5294, "step": 17140 }, { "epoch": 1.6640791771783427, "grad_norm": 1.992034372217963, "learning_rate": 2.233759491206438e-05, "loss": 0.4925, "step": 17150 }, { "epoch": 1.6650494857364642, "grad_norm": 1.7050665334433257, "learning_rate": 2.2321370627555327e-05, "loss": 0.5237, "step": 17160 }, { "epoch": 1.6660197942945856, "grad_norm": 1.8164813364072325, "learning_rate": 2.2305146343046274e-05, "loss": 0.5128, "step": 17170 }, { "epoch": 1.6669901028527072, "grad_norm": 1.7407964429381908, "learning_rate": 2.228892205853722e-05, "loss": 0.4812, "step": 17180 }, { "epoch": 1.6679604114108286, "grad_norm": 2.220803029174025, "learning_rate": 2.2272697774028166e-05, "loss": 0.4936, "step": 17190 }, { "epoch": 1.6689307199689503, "grad_norm": 1.7697338691239146, "learning_rate": 2.2256473489519113e-05, "loss": 0.5208, "step": 17200 }, { "epoch": 1.6699010285270717, "grad_norm": 2.2487670732225085, "learning_rate": 2.224024920501006e-05, "loss": 0.4833, "step": 17210 }, { "epoch": 1.670871337085193, "grad_norm": 1.888912866564453, "learning_rate": 2.222402492050101e-05, "loss": 0.5287, "step": 17220 }, { "epoch": 1.6718416456433145, "grad_norm": 1.7902985235611324, "learning_rate": 2.2207800635991955e-05, "loss": 0.5271, "step": 17230 }, { "epoch": 1.672811954201436, "grad_norm": 2.2626922106136607, "learning_rate": 2.21915763514829e-05, "loss": 0.5062, "step": 17240 }, { "epoch": 1.6737822627595575, "grad_norm": 2.0509489741448923, "learning_rate": 2.2175352066973848e-05, "loss": 0.4586, "step": 17250 }, { "epoch": 1.6747525713176792, "grad_norm": 1.7028309180471788, "learning_rate": 2.2159127782464794e-05, "loss": 0.5591, "step": 17260 }, { "epoch": 1.6757228798758006, "grad_norm": 1.8148060135409945, "learning_rate": 2.214290349795574e-05, "loss": 0.4708, "step": 17270 }, { "epoch": 1.676693188433922, "grad_norm": 1.7415357367009103, "learning_rate": 2.2126679213446687e-05, "loss": 0.4877, "step": 17280 }, { "epoch": 1.6776634969920434, "grad_norm": 1.6879714610876217, "learning_rate": 2.2110454928937634e-05, "loss": 0.4838, "step": 17290 }, { "epoch": 1.6786338055501648, "grad_norm": 1.3869597913004987, "learning_rate": 2.2094230644428583e-05, "loss": 0.496, "step": 17300 }, { "epoch": 1.6796041141082865, "grad_norm": 1.4730335934961858, "learning_rate": 2.207800635991953e-05, "loss": 0.4926, "step": 17310 }, { "epoch": 1.6805744226664079, "grad_norm": 1.771999456693497, "learning_rate": 2.2061782075410476e-05, "loss": 0.4993, "step": 17320 }, { "epoch": 1.6815447312245295, "grad_norm": 1.6689193723022826, "learning_rate": 2.2045557790901423e-05, "loss": 0.4815, "step": 17330 }, { "epoch": 1.682515039782651, "grad_norm": 2.1543699729043717, "learning_rate": 2.202933350639237e-05, "loss": 0.4772, "step": 17340 }, { "epoch": 1.6834853483407723, "grad_norm": 1.7964447035669882, "learning_rate": 2.2013109221883315e-05, "loss": 0.4872, "step": 17350 }, { "epoch": 1.6844556568988938, "grad_norm": 1.5499702179216124, "learning_rate": 2.199688493737426e-05, "loss": 0.5115, "step": 17360 }, { "epoch": 1.6854259654570152, "grad_norm": 1.736034409254174, "learning_rate": 2.1980660652865208e-05, "loss": 0.5004, "step": 17370 }, { "epoch": 1.6863962740151368, "grad_norm": 1.6670883800573928, "learning_rate": 2.1964436368356154e-05, "loss": 0.5172, "step": 17380 }, { "epoch": 1.6873665825732584, "grad_norm": 1.8976878178656118, "learning_rate": 2.1948212083847104e-05, "loss": 0.5056, "step": 17390 }, { "epoch": 1.6883368911313799, "grad_norm": 1.7825783485749558, "learning_rate": 2.193198779933805e-05, "loss": 0.5079, "step": 17400 }, { "epoch": 1.6893071996895013, "grad_norm": 1.963783922685446, "learning_rate": 2.1915763514828997e-05, "loss": 0.5595, "step": 17410 }, { "epoch": 1.6902775082476227, "grad_norm": 1.577100524669314, "learning_rate": 2.1899539230319943e-05, "loss": 0.5122, "step": 17420 }, { "epoch": 1.691247816805744, "grad_norm": 1.9224227001223828, "learning_rate": 2.188331494581089e-05, "loss": 0.5179, "step": 17430 }, { "epoch": 1.6922181253638657, "grad_norm": 1.968807335226898, "learning_rate": 2.1867090661301836e-05, "loss": 0.4811, "step": 17440 }, { "epoch": 1.6931884339219871, "grad_norm": 1.8656524316794496, "learning_rate": 2.1850866376792782e-05, "loss": 0.5562, "step": 17450 }, { "epoch": 1.6941587424801088, "grad_norm": 1.936995738098722, "learning_rate": 2.183464209228373e-05, "loss": 0.5128, "step": 17460 }, { "epoch": 1.6951290510382302, "grad_norm": 1.716335988732862, "learning_rate": 2.181841780777468e-05, "loss": 0.4878, "step": 17470 }, { "epoch": 1.6960993595963516, "grad_norm": 1.8671035370037838, "learning_rate": 2.1802193523265625e-05, "loss": 0.5631, "step": 17480 }, { "epoch": 1.697069668154473, "grad_norm": 2.10031146751523, "learning_rate": 2.178596923875657e-05, "loss": 0.4911, "step": 17490 }, { "epoch": 1.6980399767125947, "grad_norm": 1.4651955164387034, "learning_rate": 2.1769744954247518e-05, "loss": 0.4841, "step": 17500 }, { "epoch": 1.6980399767125947, "eval_loss": 0.6339951157569885, "eval_runtime": 2473.3585, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 17500 }, { "epoch": 1.699010285270716, "grad_norm": 1.9647331919080824, "learning_rate": 2.1753520669738468e-05, "loss": 0.4788, "step": 17510 }, { "epoch": 1.6999805938288377, "grad_norm": 1.9100412693293238, "learning_rate": 2.1737296385229414e-05, "loss": 0.475, "step": 17520 }, { "epoch": 1.7009509023869591, "grad_norm": 1.7020406495492606, "learning_rate": 2.172107210072036e-05, "loss": 0.4908, "step": 17530 }, { "epoch": 1.7019212109450805, "grad_norm": 1.4911580170253027, "learning_rate": 2.1704847816211307e-05, "loss": 0.4832, "step": 17540 }, { "epoch": 1.702891519503202, "grad_norm": 2.086292759031188, "learning_rate": 2.1688623531702253e-05, "loss": 0.5129, "step": 17550 }, { "epoch": 1.7038618280613234, "grad_norm": 1.5359738880009668, "learning_rate": 2.16723992471932e-05, "loss": 0.4512, "step": 17560 }, { "epoch": 1.704832136619445, "grad_norm": 1.850242190602371, "learning_rate": 2.165617496268415e-05, "loss": 0.5224, "step": 17570 }, { "epoch": 1.7058024451775666, "grad_norm": 1.2439323678887082, "learning_rate": 2.1639950678175096e-05, "loss": 0.4625, "step": 17580 }, { "epoch": 1.706772753735688, "grad_norm": 1.5186947478659967, "learning_rate": 2.1623726393666042e-05, "loss": 0.5527, "step": 17590 }, { "epoch": 1.7077430622938095, "grad_norm": 2.1869553971014035, "learning_rate": 2.160750210915699e-05, "loss": 0.499, "step": 17600 }, { "epoch": 1.7087133708519309, "grad_norm": 2.1896368464295968, "learning_rate": 2.1591277824647935e-05, "loss": 0.5182, "step": 17610 }, { "epoch": 1.7096836794100523, "grad_norm": 2.5508627899244853, "learning_rate": 2.157505354013888e-05, "loss": 0.529, "step": 17620 }, { "epoch": 1.710653987968174, "grad_norm": 2.0451418684052527, "learning_rate": 2.1558829255629828e-05, "loss": 0.4789, "step": 17630 }, { "epoch": 1.7116242965262953, "grad_norm": 2.358852269881603, "learning_rate": 2.1542604971120774e-05, "loss": 0.4701, "step": 17640 }, { "epoch": 1.712594605084417, "grad_norm": 1.65876380363299, "learning_rate": 2.152638068661172e-05, "loss": 0.5078, "step": 17650 }, { "epoch": 1.7135649136425384, "grad_norm": 1.819260137292149, "learning_rate": 2.151015640210267e-05, "loss": 0.5115, "step": 17660 }, { "epoch": 1.7145352222006598, "grad_norm": 1.9278786735506124, "learning_rate": 2.1493932117593616e-05, "loss": 0.526, "step": 17670 }, { "epoch": 1.7155055307587812, "grad_norm": 1.647908501988862, "learning_rate": 2.1477707833084563e-05, "loss": 0.4455, "step": 17680 }, { "epoch": 1.7164758393169026, "grad_norm": 2.211325829098572, "learning_rate": 2.146148354857551e-05, "loss": 0.5251, "step": 17690 }, { "epoch": 1.7174461478750243, "grad_norm": 2.068602886260178, "learning_rate": 2.1445259264066456e-05, "loss": 0.5173, "step": 17700 }, { "epoch": 1.718416456433146, "grad_norm": 1.7976369940318933, "learning_rate": 2.1429034979557402e-05, "loss": 0.4997, "step": 17710 }, { "epoch": 1.7193867649912673, "grad_norm": 1.8518665259593716, "learning_rate": 2.141281069504835e-05, "loss": 0.5221, "step": 17720 }, { "epoch": 1.7203570735493887, "grad_norm": 2.162010082710502, "learning_rate": 2.1396586410539295e-05, "loss": 0.5096, "step": 17730 }, { "epoch": 1.7213273821075101, "grad_norm": 2.3175061118466704, "learning_rate": 2.138036212603024e-05, "loss": 0.5203, "step": 17740 }, { "epoch": 1.7222976906656315, "grad_norm": 2.0036851029306364, "learning_rate": 2.136413784152119e-05, "loss": 0.5312, "step": 17750 }, { "epoch": 1.7232679992237532, "grad_norm": 2.232799745735125, "learning_rate": 2.1347913557012137e-05, "loss": 0.477, "step": 17760 }, { "epoch": 1.7242383077818746, "grad_norm": 2.0579250461700225, "learning_rate": 2.1331689272503084e-05, "loss": 0.5104, "step": 17770 }, { "epoch": 1.7252086163399962, "grad_norm": 1.9067412132990194, "learning_rate": 2.131546498799403e-05, "loss": 0.4663, "step": 17780 }, { "epoch": 1.7261789248981176, "grad_norm": 1.6166777364493794, "learning_rate": 2.1299240703484976e-05, "loss": 0.5091, "step": 17790 }, { "epoch": 1.727149233456239, "grad_norm": 1.8400065423176315, "learning_rate": 2.1283016418975923e-05, "loss": 0.4577, "step": 17800 }, { "epoch": 1.7281195420143605, "grad_norm": 1.8343636819724598, "learning_rate": 2.126679213446687e-05, "loss": 0.5171, "step": 17810 }, { "epoch": 1.729089850572482, "grad_norm": 2.035066789255769, "learning_rate": 2.1250567849957816e-05, "loss": 0.4809, "step": 17820 }, { "epoch": 1.7300601591306035, "grad_norm": 1.9750470965118032, "learning_rate": 2.1234343565448765e-05, "loss": 0.4681, "step": 17830 }, { "epoch": 1.7310304676887251, "grad_norm": 1.707449728088738, "learning_rate": 2.1218119280939712e-05, "loss": 0.4768, "step": 17840 }, { "epoch": 1.7320007762468466, "grad_norm": 2.091683187172791, "learning_rate": 2.1201894996430658e-05, "loss": 0.4967, "step": 17850 }, { "epoch": 1.732971084804968, "grad_norm": 1.8317230891625513, "learning_rate": 2.1185670711921604e-05, "loss": 0.4617, "step": 17860 }, { "epoch": 1.7339413933630894, "grad_norm": 1.8581590049450867, "learning_rate": 2.116944642741255e-05, "loss": 0.453, "step": 17870 }, { "epoch": 1.7349117019212108, "grad_norm": 2.394138871900483, "learning_rate": 2.1153222142903497e-05, "loss": 0.5231, "step": 17880 }, { "epoch": 1.7358820104793324, "grad_norm": 1.8303455092887013, "learning_rate": 2.1136997858394444e-05, "loss": 0.5045, "step": 17890 }, { "epoch": 1.736852319037454, "grad_norm": 1.7040025873966649, "learning_rate": 2.1120773573885393e-05, "loss": 0.5185, "step": 17900 }, { "epoch": 1.7378226275955755, "grad_norm": 1.721795865054807, "learning_rate": 2.110454928937634e-05, "loss": 0.5153, "step": 17910 }, { "epoch": 1.738792936153697, "grad_norm": 2.195542603436347, "learning_rate": 2.1088325004867286e-05, "loss": 0.515, "step": 17920 }, { "epoch": 1.7397632447118183, "grad_norm": 1.9475497865392444, "learning_rate": 2.1072100720358236e-05, "loss": 0.6134, "step": 17930 }, { "epoch": 1.7407335532699397, "grad_norm": 2.007827581925297, "learning_rate": 2.1055876435849182e-05, "loss": 0.4483, "step": 17940 }, { "epoch": 1.7417038618280614, "grad_norm": 2.427289180607889, "learning_rate": 2.103965215134013e-05, "loss": 0.4791, "step": 17950 }, { "epoch": 1.7426741703861828, "grad_norm": 2.090548009394418, "learning_rate": 2.1023427866831075e-05, "loss": 0.4485, "step": 17960 }, { "epoch": 1.7436444789443044, "grad_norm": 1.4231148878110762, "learning_rate": 2.100720358232202e-05, "loss": 0.4819, "step": 17970 }, { "epoch": 1.7446147875024258, "grad_norm": 1.952695177483555, "learning_rate": 2.0990979297812968e-05, "loss": 0.4615, "step": 17980 }, { "epoch": 1.7455850960605472, "grad_norm": 1.5076062020543768, "learning_rate": 2.0974755013303914e-05, "loss": 0.5145, "step": 17990 }, { "epoch": 1.7465554046186686, "grad_norm": 2.385317508868708, "learning_rate": 2.095853072879486e-05, "loss": 0.4667, "step": 18000 }, { "epoch": 1.7465554046186686, "eval_loss": 0.6351094841957092, "eval_runtime": 2471.0101, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 18000 }, { "epoch": 1.74752571317679, "grad_norm": 1.5204879350818516, "learning_rate": 2.0942306444285807e-05, "loss": 0.4808, "step": 18010 }, { "epoch": 1.7484960217349117, "grad_norm": 2.363404313519829, "learning_rate": 2.0926082159776757e-05, "loss": 0.4829, "step": 18020 }, { "epoch": 1.7494663302930333, "grad_norm": 1.771660432749116, "learning_rate": 2.0909857875267703e-05, "loss": 0.4838, "step": 18030 }, { "epoch": 1.7504366388511547, "grad_norm": 1.754309450094576, "learning_rate": 2.089363359075865e-05, "loss": 0.5289, "step": 18040 }, { "epoch": 1.7514069474092762, "grad_norm": 1.7394710721825013, "learning_rate": 2.0877409306249596e-05, "loss": 0.4681, "step": 18050 }, { "epoch": 1.7523772559673976, "grad_norm": 1.903002291309086, "learning_rate": 2.0861185021740542e-05, "loss": 0.4352, "step": 18060 }, { "epoch": 1.753347564525519, "grad_norm": 1.8599137833426156, "learning_rate": 2.084496073723149e-05, "loss": 0.5158, "step": 18070 }, { "epoch": 1.7543178730836406, "grad_norm": 2.416977234001108, "learning_rate": 2.0828736452722435e-05, "loss": 0.458, "step": 18080 }, { "epoch": 1.755288181641762, "grad_norm": 2.0471286021548876, "learning_rate": 2.081251216821338e-05, "loss": 0.5093, "step": 18090 }, { "epoch": 1.7562584901998837, "grad_norm": 1.676028789821819, "learning_rate": 2.079628788370433e-05, "loss": 0.4682, "step": 18100 }, { "epoch": 1.757228798758005, "grad_norm": 2.27723774043769, "learning_rate": 2.0780063599195278e-05, "loss": 0.4844, "step": 18110 }, { "epoch": 1.7581991073161265, "grad_norm": 1.9065062110447546, "learning_rate": 2.0763839314686224e-05, "loss": 0.4913, "step": 18120 }, { "epoch": 1.759169415874248, "grad_norm": 1.499218120899005, "learning_rate": 2.074761503017717e-05, "loss": 0.4764, "step": 18130 }, { "epoch": 1.7601397244323695, "grad_norm": 1.7111231535151605, "learning_rate": 2.0731390745668117e-05, "loss": 0.4376, "step": 18140 }, { "epoch": 1.761110032990491, "grad_norm": 2.1731054208801406, "learning_rate": 2.0715166461159063e-05, "loss": 0.5115, "step": 18150 }, { "epoch": 1.7620803415486126, "grad_norm": 2.537816428144228, "learning_rate": 2.069894217665001e-05, "loss": 0.4872, "step": 18160 }, { "epoch": 1.763050650106734, "grad_norm": 1.8860270703852153, "learning_rate": 2.0682717892140956e-05, "loss": 0.544, "step": 18170 }, { "epoch": 1.7640209586648554, "grad_norm": 2.021054656113362, "learning_rate": 2.0666493607631902e-05, "loss": 0.4595, "step": 18180 }, { "epoch": 1.7649912672229768, "grad_norm": 1.7916097460233942, "learning_rate": 2.0650269323122852e-05, "loss": 0.5098, "step": 18190 }, { "epoch": 1.7659615757810982, "grad_norm": 2.3100480138537, "learning_rate": 2.06340450386138e-05, "loss": 0.4801, "step": 18200 }, { "epoch": 1.7669318843392199, "grad_norm": 2.4377132473167413, "learning_rate": 2.0617820754104745e-05, "loss": 0.4507, "step": 18210 }, { "epoch": 1.7679021928973415, "grad_norm": 2.303644526731782, "learning_rate": 2.060159646959569e-05, "loss": 0.4557, "step": 18220 }, { "epoch": 1.768872501455463, "grad_norm": 2.104570073246246, "learning_rate": 2.0585372185086638e-05, "loss": 0.4657, "step": 18230 }, { "epoch": 1.7698428100135843, "grad_norm": 1.902037691736978, "learning_rate": 2.0569147900577584e-05, "loss": 0.4885, "step": 18240 }, { "epoch": 1.7708131185717058, "grad_norm": 1.8321649978057852, "learning_rate": 2.055292361606853e-05, "loss": 0.4953, "step": 18250 }, { "epoch": 1.7717834271298272, "grad_norm": 1.7318686323178791, "learning_rate": 2.0536699331559477e-05, "loss": 0.4993, "step": 18260 }, { "epoch": 1.7727537356879488, "grad_norm": 1.6339592524860076, "learning_rate": 2.0520475047050423e-05, "loss": 0.5311, "step": 18270 }, { "epoch": 1.7737240442460702, "grad_norm": 2.0377735402561448, "learning_rate": 2.0504250762541373e-05, "loss": 0.4785, "step": 18280 }, { "epoch": 1.7746943528041919, "grad_norm": 1.4282663698707199, "learning_rate": 2.0488026478032323e-05, "loss": 0.5077, "step": 18290 }, { "epoch": 1.7756646613623133, "grad_norm": 1.9149293437143238, "learning_rate": 2.047180219352327e-05, "loss": 0.5319, "step": 18300 }, { "epoch": 1.7766349699204347, "grad_norm": 2.1578236484875077, "learning_rate": 2.0455577909014215e-05, "loss": 0.4928, "step": 18310 }, { "epoch": 1.777605278478556, "grad_norm": 2.107005500307039, "learning_rate": 2.0439353624505162e-05, "loss": 0.4969, "step": 18320 }, { "epoch": 1.7785755870366775, "grad_norm": 1.9793180118959242, "learning_rate": 2.0423129339996108e-05, "loss": 0.4585, "step": 18330 }, { "epoch": 1.7795458955947991, "grad_norm": 1.8333976168598185, "learning_rate": 2.0406905055487055e-05, "loss": 0.4982, "step": 18340 }, { "epoch": 1.7805162041529208, "grad_norm": 1.9049818949759387, "learning_rate": 2.0390680770978e-05, "loss": 0.494, "step": 18350 }, { "epoch": 1.7814865127110422, "grad_norm": 1.9993776566406598, "learning_rate": 2.0374456486468947e-05, "loss": 0.4918, "step": 18360 }, { "epoch": 1.7824568212691636, "grad_norm": 1.901331422597336, "learning_rate": 2.0358232201959894e-05, "loss": 0.5138, "step": 18370 }, { "epoch": 1.783427129827285, "grad_norm": 1.6972755513639741, "learning_rate": 2.0342007917450843e-05, "loss": 0.4715, "step": 18380 }, { "epoch": 1.7843974383854064, "grad_norm": 1.7683176039213586, "learning_rate": 2.032578363294179e-05, "loss": 0.4917, "step": 18390 }, { "epoch": 1.785367746943528, "grad_norm": 2.473174371442726, "learning_rate": 2.0309559348432736e-05, "loss": 0.5202, "step": 18400 }, { "epoch": 1.7863380555016495, "grad_norm": 1.5926803956510256, "learning_rate": 2.0293335063923683e-05, "loss": 0.4988, "step": 18410 }, { "epoch": 1.7873083640597711, "grad_norm": 2.291647967076523, "learning_rate": 2.027711077941463e-05, "loss": 0.4824, "step": 18420 }, { "epoch": 1.7882786726178925, "grad_norm": 1.8562307329324135, "learning_rate": 2.0260886494905575e-05, "loss": 0.4563, "step": 18430 }, { "epoch": 1.789248981176014, "grad_norm": 2.0376982504169225, "learning_rate": 2.0244662210396522e-05, "loss": 0.5306, "step": 18440 }, { "epoch": 1.7902192897341354, "grad_norm": 2.2229061869152287, "learning_rate": 2.0228437925887468e-05, "loss": 0.4992, "step": 18450 }, { "epoch": 1.791189598292257, "grad_norm": 1.9381177105445806, "learning_rate": 2.0212213641378418e-05, "loss": 0.5163, "step": 18460 }, { "epoch": 1.7921599068503784, "grad_norm": 2.2038889339934276, "learning_rate": 2.0195989356869364e-05, "loss": 0.4872, "step": 18470 }, { "epoch": 1.7931302154085, "grad_norm": 1.7467002610372642, "learning_rate": 2.017976507236031e-05, "loss": 0.4876, "step": 18480 }, { "epoch": 1.7941005239666215, "grad_norm": 1.80516559021005, "learning_rate": 2.0163540787851257e-05, "loss": 0.5155, "step": 18490 }, { "epoch": 1.7950708325247429, "grad_norm": 1.6841944257951464, "learning_rate": 2.0147316503342203e-05, "loss": 0.485, "step": 18500 }, { "epoch": 1.7950708325247429, "eval_loss": 0.634519636631012, "eval_runtime": 2476.7893, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 18500 }, { "epoch": 1.7960411410828643, "grad_norm": 1.9439021855874303, "learning_rate": 2.013109221883315e-05, "loss": 0.452, "step": 18510 }, { "epoch": 1.7970114496409857, "grad_norm": 2.2513317080100226, "learning_rate": 2.0114867934324096e-05, "loss": 0.5325, "step": 18520 }, { "epoch": 1.7979817581991073, "grad_norm": 2.08111292912013, "learning_rate": 2.0098643649815043e-05, "loss": 0.5473, "step": 18530 }, { "epoch": 1.798952066757229, "grad_norm": 1.5733834399311872, "learning_rate": 2.008241936530599e-05, "loss": 0.477, "step": 18540 }, { "epoch": 1.7999223753153504, "grad_norm": 2.8502682562631954, "learning_rate": 2.006619508079694e-05, "loss": 0.5206, "step": 18550 }, { "epoch": 1.8008926838734718, "grad_norm": 1.753275957528235, "learning_rate": 2.0049970796287885e-05, "loss": 0.4547, "step": 18560 }, { "epoch": 1.8018629924315932, "grad_norm": 2.1314672834115473, "learning_rate": 2.003374651177883e-05, "loss": 0.5419, "step": 18570 }, { "epoch": 1.8028333009897146, "grad_norm": 1.9602167097784182, "learning_rate": 2.0017522227269778e-05, "loss": 0.5155, "step": 18580 }, { "epoch": 1.8038036095478363, "grad_norm": 1.938753313860514, "learning_rate": 2.0001297942760724e-05, "loss": 0.5068, "step": 18590 }, { "epoch": 1.8047739181059577, "grad_norm": 2.5245908920622924, "learning_rate": 1.998507365825167e-05, "loss": 0.4823, "step": 18600 }, { "epoch": 1.8057442266640793, "grad_norm": 1.8472353172834934, "learning_rate": 1.9968849373742617e-05, "loss": 0.5307, "step": 18610 }, { "epoch": 1.8067145352222007, "grad_norm": 2.119846040052731, "learning_rate": 1.9952625089233563e-05, "loss": 0.5497, "step": 18620 }, { "epoch": 1.8076848437803221, "grad_norm": 1.8254352104689588, "learning_rate": 1.9936400804724513e-05, "loss": 0.5343, "step": 18630 }, { "epoch": 1.8086551523384435, "grad_norm": 2.286926975318207, "learning_rate": 1.992017652021546e-05, "loss": 0.5079, "step": 18640 }, { "epoch": 1.809625460896565, "grad_norm": 2.145456846814811, "learning_rate": 1.9903952235706406e-05, "loss": 0.4792, "step": 18650 }, { "epoch": 1.8105957694546866, "grad_norm": 2.1237990360529615, "learning_rate": 1.9887727951197352e-05, "loss": 0.5221, "step": 18660 }, { "epoch": 1.8115660780128082, "grad_norm": 1.9365349043963929, "learning_rate": 1.98715036666883e-05, "loss": 0.4994, "step": 18670 }, { "epoch": 1.8125363865709296, "grad_norm": 2.1477510328366543, "learning_rate": 1.985527938217925e-05, "loss": 0.5051, "step": 18680 }, { "epoch": 1.813506695129051, "grad_norm": 2.24009021245378, "learning_rate": 1.9839055097670195e-05, "loss": 0.4411, "step": 18690 }, { "epoch": 1.8144770036871725, "grad_norm": 2.161147622755333, "learning_rate": 1.982283081316114e-05, "loss": 0.4947, "step": 18700 }, { "epoch": 1.8154473122452939, "grad_norm": 2.5777752480302296, "learning_rate": 1.9806606528652088e-05, "loss": 0.5027, "step": 18710 }, { "epoch": 1.8164176208034155, "grad_norm": 1.5289437881489423, "learning_rate": 1.9790382244143034e-05, "loss": 0.5276, "step": 18720 }, { "epoch": 1.817387929361537, "grad_norm": 1.808692912848697, "learning_rate": 1.977415795963398e-05, "loss": 0.5155, "step": 18730 }, { "epoch": 1.8183582379196586, "grad_norm": 1.7568825751279589, "learning_rate": 1.975793367512493e-05, "loss": 0.4658, "step": 18740 }, { "epoch": 1.81932854647778, "grad_norm": 1.5226619183552335, "learning_rate": 1.9741709390615877e-05, "loss": 0.4764, "step": 18750 }, { "epoch": 1.8202988550359014, "grad_norm": 1.5664856097734037, "learning_rate": 1.9725485106106823e-05, "loss": 0.4503, "step": 18760 }, { "epoch": 1.8212691635940228, "grad_norm": 1.7127846075920363, "learning_rate": 1.970926082159777e-05, "loss": 0.4558, "step": 18770 }, { "epoch": 1.8222394721521444, "grad_norm": 2.1030557687163007, "learning_rate": 1.9693036537088716e-05, "loss": 0.5578, "step": 18780 }, { "epoch": 1.8232097807102658, "grad_norm": 2.0907960296944474, "learning_rate": 1.9676812252579662e-05, "loss": 0.4974, "step": 18790 }, { "epoch": 1.8241800892683875, "grad_norm": 1.4612012638589904, "learning_rate": 1.966058796807061e-05, "loss": 0.4931, "step": 18800 }, { "epoch": 1.825150397826509, "grad_norm": 1.6970216305431254, "learning_rate": 1.9644363683561555e-05, "loss": 0.4691, "step": 18810 }, { "epoch": 1.8261207063846303, "grad_norm": 2.4415425125364125, "learning_rate": 1.9628139399052505e-05, "loss": 0.5249, "step": 18820 }, { "epoch": 1.8270910149427517, "grad_norm": 1.7771990898290009, "learning_rate": 1.961191511454345e-05, "loss": 0.5019, "step": 18830 }, { "epoch": 1.8280613235008731, "grad_norm": 1.7839717190667712, "learning_rate": 1.9595690830034397e-05, "loss": 0.5052, "step": 18840 }, { "epoch": 1.8290316320589948, "grad_norm": 2.209198913639542, "learning_rate": 1.9579466545525344e-05, "loss": 0.5235, "step": 18850 }, { "epoch": 1.8300019406171164, "grad_norm": 2.200681320097298, "learning_rate": 1.956324226101629e-05, "loss": 0.5302, "step": 18860 }, { "epoch": 1.8309722491752378, "grad_norm": 2.000921134970765, "learning_rate": 1.9547017976507237e-05, "loss": 0.4648, "step": 18870 }, { "epoch": 1.8319425577333592, "grad_norm": 2.623565257277865, "learning_rate": 1.9530793691998183e-05, "loss": 0.4855, "step": 18880 }, { "epoch": 1.8329128662914806, "grad_norm": 1.8550961862060569, "learning_rate": 1.951456940748913e-05, "loss": 0.5135, "step": 18890 }, { "epoch": 1.833883174849602, "grad_norm": 2.0777083761268718, "learning_rate": 1.9498345122980076e-05, "loss": 0.5236, "step": 18900 }, { "epoch": 1.8348534834077237, "grad_norm": 2.2339320492583337, "learning_rate": 1.9482120838471025e-05, "loss": 0.5336, "step": 18910 }, { "epoch": 1.835823791965845, "grad_norm": 2.2130179741978173, "learning_rate": 1.9465896553961972e-05, "loss": 0.4468, "step": 18920 }, { "epoch": 1.8367941005239667, "grad_norm": 1.9136609274036684, "learning_rate": 1.9449672269452918e-05, "loss": 0.516, "step": 18930 }, { "epoch": 1.8377644090820882, "grad_norm": 1.9319944175941055, "learning_rate": 1.9433447984943865e-05, "loss": 0.4837, "step": 18940 }, { "epoch": 1.8387347176402096, "grad_norm": 1.8144243996167448, "learning_rate": 1.941722370043481e-05, "loss": 0.5388, "step": 18950 }, { "epoch": 1.839705026198331, "grad_norm": 1.9871922674209272, "learning_rate": 1.9400999415925757e-05, "loss": 0.5543, "step": 18960 }, { "epoch": 1.8406753347564524, "grad_norm": 2.1436846647503707, "learning_rate": 1.9384775131416704e-05, "loss": 0.4787, "step": 18970 }, { "epoch": 1.841645643314574, "grad_norm": 1.926685455696982, "learning_rate": 1.936855084690765e-05, "loss": 0.5096, "step": 18980 }, { "epoch": 1.8426159518726957, "grad_norm": 2.342596116742962, "learning_rate": 1.93523265623986e-05, "loss": 0.522, "step": 18990 }, { "epoch": 1.843586260430817, "grad_norm": 2.255501885540124, "learning_rate": 1.9336102277889546e-05, "loss": 0.5377, "step": 19000 }, { "epoch": 1.843586260430817, "eval_loss": 0.6318312883377075, "eval_runtime": 2473.7549, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 19000 }, { "epoch": 1.8445565689889385, "grad_norm": 2.1596583204757884, "learning_rate": 1.9319877993380493e-05, "loss": 0.5629, "step": 19010 }, { "epoch": 1.84552687754706, "grad_norm": 1.8190379524057834, "learning_rate": 1.930365370887144e-05, "loss": 0.4796, "step": 19020 }, { "epoch": 1.8464971861051813, "grad_norm": 1.019855687196459, "learning_rate": 1.9287429424362385e-05, "loss": 0.471, "step": 19030 }, { "epoch": 1.847467494663303, "grad_norm": 1.9569552509020551, "learning_rate": 1.9271205139853332e-05, "loss": 0.5126, "step": 19040 }, { "epoch": 1.8484378032214244, "grad_norm": 2.1870238918367524, "learning_rate": 1.9254980855344278e-05, "loss": 0.4865, "step": 19050 }, { "epoch": 1.849408111779546, "grad_norm": 1.594729244733264, "learning_rate": 1.9238756570835225e-05, "loss": 0.4802, "step": 19060 }, { "epoch": 1.8503784203376674, "grad_norm": 2.1426009139912843, "learning_rate": 1.9222532286326174e-05, "loss": 0.5303, "step": 19070 }, { "epoch": 1.8513487288957888, "grad_norm": 1.7760248540972265, "learning_rate": 1.920630800181712e-05, "loss": 0.4948, "step": 19080 }, { "epoch": 1.8523190374539102, "grad_norm": 1.6725671761260958, "learning_rate": 1.919008371730807e-05, "loss": 0.4922, "step": 19090 }, { "epoch": 1.8532893460120319, "grad_norm": 1.9565774055440965, "learning_rate": 1.9173859432799017e-05, "loss": 0.5658, "step": 19100 }, { "epoch": 1.8542596545701533, "grad_norm": 2.231350831039019, "learning_rate": 1.9157635148289963e-05, "loss": 0.4906, "step": 19110 }, { "epoch": 1.855229963128275, "grad_norm": 1.5993699848453529, "learning_rate": 1.914141086378091e-05, "loss": 0.4796, "step": 19120 }, { "epoch": 1.8562002716863963, "grad_norm": 2.0780226359466387, "learning_rate": 1.9125186579271856e-05, "loss": 0.4773, "step": 19130 }, { "epoch": 1.8571705802445178, "grad_norm": 2.01763881138038, "learning_rate": 1.9108962294762802e-05, "loss": 0.4984, "step": 19140 }, { "epoch": 1.8581408888026392, "grad_norm": 2.240723917017317, "learning_rate": 1.909273801025375e-05, "loss": 0.4559, "step": 19150 }, { "epoch": 1.8591111973607606, "grad_norm": 1.849249605812625, "learning_rate": 1.9076513725744695e-05, "loss": 0.5445, "step": 19160 }, { "epoch": 1.8600815059188822, "grad_norm": 1.7656331962779384, "learning_rate": 1.906028944123564e-05, "loss": 0.466, "step": 19170 }, { "epoch": 1.8610518144770039, "grad_norm": 1.9723478771428924, "learning_rate": 1.904406515672659e-05, "loss": 0.4951, "step": 19180 }, { "epoch": 1.8620221230351253, "grad_norm": 2.133741060165629, "learning_rate": 1.9027840872217538e-05, "loss": 0.5064, "step": 19190 }, { "epoch": 1.8629924315932467, "grad_norm": 2.267317304504692, "learning_rate": 1.9011616587708484e-05, "loss": 0.4774, "step": 19200 }, { "epoch": 1.863962740151368, "grad_norm": 1.6590470147649787, "learning_rate": 1.899539230319943e-05, "loss": 0.4684, "step": 19210 }, { "epoch": 1.8649330487094895, "grad_norm": 1.7428570168415043, "learning_rate": 1.8979168018690377e-05, "loss": 0.494, "step": 19220 }, { "epoch": 1.8659033572676111, "grad_norm": 1.6212375134091286, "learning_rate": 1.8962943734181323e-05, "loss": 0.481, "step": 19230 }, { "epoch": 1.8668736658257326, "grad_norm": 2.0261745446913717, "learning_rate": 1.894671944967227e-05, "loss": 0.5107, "step": 19240 }, { "epoch": 1.8678439743838542, "grad_norm": 1.7751731877262686, "learning_rate": 1.8930495165163216e-05, "loss": 0.4615, "step": 19250 }, { "epoch": 1.8688142829419756, "grad_norm": 1.5566178962662132, "learning_rate": 1.8914270880654162e-05, "loss": 0.5061, "step": 19260 }, { "epoch": 1.869784591500097, "grad_norm": 2.0648415585072137, "learning_rate": 1.8898046596145112e-05, "loss": 0.4721, "step": 19270 }, { "epoch": 1.8707549000582184, "grad_norm": 1.8255664078089486, "learning_rate": 1.888182231163606e-05, "loss": 0.4652, "step": 19280 }, { "epoch": 1.8717252086163398, "grad_norm": 2.144758151637292, "learning_rate": 1.8865598027127005e-05, "loss": 0.4391, "step": 19290 }, { "epoch": 1.8726955171744615, "grad_norm": 1.803874603824675, "learning_rate": 1.884937374261795e-05, "loss": 0.492, "step": 19300 }, { "epoch": 1.8736658257325831, "grad_norm": 2.1603319792398685, "learning_rate": 1.8833149458108898e-05, "loss": 0.4602, "step": 19310 }, { "epoch": 1.8746361342907045, "grad_norm": 1.931469165672542, "learning_rate": 1.8816925173599844e-05, "loss": 0.479, "step": 19320 }, { "epoch": 1.875606442848826, "grad_norm": 2.289215850770512, "learning_rate": 1.880070088909079e-05, "loss": 0.475, "step": 19330 }, { "epoch": 1.8765767514069474, "grad_norm": 2.0548701793141384, "learning_rate": 1.8784476604581737e-05, "loss": 0.4523, "step": 19340 }, { "epoch": 1.8775470599650688, "grad_norm": 1.6304292402973444, "learning_rate": 1.8768252320072687e-05, "loss": 0.4704, "step": 19350 }, { "epoch": 1.8785173685231904, "grad_norm": 1.3760981254681826, "learning_rate": 1.8752028035563633e-05, "loss": 0.509, "step": 19360 }, { "epoch": 1.8794876770813118, "grad_norm": 2.1389308936606364, "learning_rate": 1.873580375105458e-05, "loss": 0.4791, "step": 19370 }, { "epoch": 1.8804579856394334, "grad_norm": 2.0673076161036663, "learning_rate": 1.8719579466545526e-05, "loss": 0.4902, "step": 19380 }, { "epoch": 1.8814282941975549, "grad_norm": 1.861431210462014, "learning_rate": 1.8703355182036472e-05, "loss": 0.4719, "step": 19390 }, { "epoch": 1.8823986027556763, "grad_norm": 1.9788908151587594, "learning_rate": 1.868713089752742e-05, "loss": 0.5127, "step": 19400 }, { "epoch": 1.8833689113137977, "grad_norm": 1.9487695948736958, "learning_rate": 1.8670906613018365e-05, "loss": 0.4469, "step": 19410 }, { "epoch": 1.8843392198719193, "grad_norm": 1.8048413444236069, "learning_rate": 1.865468232850931e-05, "loss": 0.5345, "step": 19420 }, { "epoch": 1.8853095284300407, "grad_norm": 1.8168395167881652, "learning_rate": 1.8638458044000258e-05, "loss": 0.4957, "step": 19430 }, { "epoch": 1.8862798369881624, "grad_norm": 1.9900462235084369, "learning_rate": 1.8622233759491207e-05, "loss": 0.4928, "step": 19440 }, { "epoch": 1.8872501455462838, "grad_norm": 1.9978639659195843, "learning_rate": 1.8606009474982154e-05, "loss": 0.5085, "step": 19450 }, { "epoch": 1.8882204541044052, "grad_norm": 1.9870398133754268, "learning_rate": 1.8589785190473104e-05, "loss": 0.4857, "step": 19460 }, { "epoch": 1.8891907626625266, "grad_norm": 1.9723654086733247, "learning_rate": 1.857356090596405e-05, "loss": 0.4658, "step": 19470 }, { "epoch": 1.890161071220648, "grad_norm": 1.8141064421371964, "learning_rate": 1.8557336621454996e-05, "loss": 0.4881, "step": 19480 }, { "epoch": 1.8911313797787697, "grad_norm": 1.6779273277493256, "learning_rate": 1.8541112336945943e-05, "loss": 0.4798, "step": 19490 }, { "epoch": 1.8921016883368913, "grad_norm": 1.901625964641795, "learning_rate": 1.852488805243689e-05, "loss": 0.5199, "step": 19500 }, { "epoch": 1.8921016883368913, "eval_loss": 0.631912350654602, "eval_runtime": 2473.0661, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 19500 }, { "epoch": 1.8930719968950127, "grad_norm": 1.7019367377384935, "learning_rate": 1.8508663767927836e-05, "loss": 0.4693, "step": 19510 }, { "epoch": 1.8940423054531341, "grad_norm": 1.9777898503801572, "learning_rate": 1.8492439483418782e-05, "loss": 0.4907, "step": 19520 }, { "epoch": 1.8950126140112555, "grad_norm": 2.613311730337197, "learning_rate": 1.8476215198909728e-05, "loss": 0.5179, "step": 19530 }, { "epoch": 1.895982922569377, "grad_norm": 1.3976847929675515, "learning_rate": 1.8459990914400678e-05, "loss": 0.4316, "step": 19540 }, { "epoch": 1.8969532311274986, "grad_norm": 2.1428783623843715, "learning_rate": 1.8443766629891624e-05, "loss": 0.5275, "step": 19550 }, { "epoch": 1.89792353968562, "grad_norm": 1.6878966830886573, "learning_rate": 1.842754234538257e-05, "loss": 0.5061, "step": 19560 }, { "epoch": 1.8988938482437416, "grad_norm": 1.6660923395832288, "learning_rate": 1.8411318060873517e-05, "loss": 0.4846, "step": 19570 }, { "epoch": 1.899864156801863, "grad_norm": 1.88350077160719, "learning_rate": 1.8395093776364464e-05, "loss": 0.4558, "step": 19580 }, { "epoch": 1.9008344653599845, "grad_norm": 1.8180585161160603, "learning_rate": 1.837886949185541e-05, "loss": 0.4462, "step": 19590 }, { "epoch": 1.9018047739181059, "grad_norm": 2.17218483201875, "learning_rate": 1.8362645207346356e-05, "loss": 0.4732, "step": 19600 }, { "epoch": 1.9027750824762273, "grad_norm": 2.150072249394045, "learning_rate": 1.8346420922837303e-05, "loss": 0.4807, "step": 19610 }, { "epoch": 1.903745391034349, "grad_norm": 2.186590190383392, "learning_rate": 1.8330196638328253e-05, "loss": 0.5064, "step": 19620 }, { "epoch": 1.9047156995924706, "grad_norm": 1.9773135303028604, "learning_rate": 1.83139723538192e-05, "loss": 0.4987, "step": 19630 }, { "epoch": 1.905686008150592, "grad_norm": 1.7722699958849875, "learning_rate": 1.8297748069310145e-05, "loss": 0.493, "step": 19640 }, { "epoch": 1.9066563167087134, "grad_norm": 2.398748653045921, "learning_rate": 1.828152378480109e-05, "loss": 0.4632, "step": 19650 }, { "epoch": 1.9076266252668348, "grad_norm": 1.9381638418073313, "learning_rate": 1.8265299500292038e-05, "loss": 0.4879, "step": 19660 }, { "epoch": 1.9085969338249562, "grad_norm": 1.806142449327403, "learning_rate": 1.8249075215782984e-05, "loss": 0.4426, "step": 19670 }, { "epoch": 1.9095672423830778, "grad_norm": 1.8375555769454368, "learning_rate": 1.823285093127393e-05, "loss": 0.5133, "step": 19680 }, { "epoch": 1.9105375509411993, "grad_norm": 2.1900711143018703, "learning_rate": 1.8216626646764877e-05, "loss": 0.4783, "step": 19690 }, { "epoch": 1.911507859499321, "grad_norm": 1.7860565394856096, "learning_rate": 1.8200402362255824e-05, "loss": 0.5432, "step": 19700 }, { "epoch": 1.9124781680574423, "grad_norm": 1.5287991043658957, "learning_rate": 1.8184178077746773e-05, "loss": 0.4818, "step": 19710 }, { "epoch": 1.9134484766155637, "grad_norm": 1.9162344859036078, "learning_rate": 1.816795379323772e-05, "loss": 0.5134, "step": 19720 }, { "epoch": 1.9144187851736851, "grad_norm": 1.9361948714776511, "learning_rate": 1.8151729508728666e-05, "loss": 0.4705, "step": 19730 }, { "epoch": 1.9153890937318068, "grad_norm": 1.7948070446854587, "learning_rate": 1.8135505224219613e-05, "loss": 0.4713, "step": 19740 }, { "epoch": 1.9163594022899282, "grad_norm": 1.94331048872498, "learning_rate": 1.811928093971056e-05, "loss": 0.4636, "step": 19750 }, { "epoch": 1.9173297108480498, "grad_norm": 1.9083792067669565, "learning_rate": 1.8103056655201505e-05, "loss": 0.4985, "step": 19760 }, { "epoch": 1.9183000194061712, "grad_norm": 1.7889544788537133, "learning_rate": 1.808683237069245e-05, "loss": 0.465, "step": 19770 }, { "epoch": 1.9192703279642926, "grad_norm": 2.1133403229015606, "learning_rate": 1.8070608086183398e-05, "loss": 0.4962, "step": 19780 }, { "epoch": 1.920240636522414, "grad_norm": 2.119511574698095, "learning_rate": 1.8054383801674344e-05, "loss": 0.5265, "step": 19790 }, { "epoch": 1.9212109450805355, "grad_norm": 1.8530048672687303, "learning_rate": 1.8038159517165294e-05, "loss": 0.4674, "step": 19800 }, { "epoch": 1.922181253638657, "grad_norm": 2.060163549018839, "learning_rate": 1.802193523265624e-05, "loss": 0.5232, "step": 19810 }, { "epoch": 1.9231515621967787, "grad_norm": 1.582662573057051, "learning_rate": 1.8005710948147187e-05, "loss": 0.5146, "step": 19820 }, { "epoch": 1.9241218707549002, "grad_norm": 1.9973478575975316, "learning_rate": 1.7989486663638133e-05, "loss": 0.5294, "step": 19830 }, { "epoch": 1.9250921793130216, "grad_norm": 2.2324903918022088, "learning_rate": 1.797326237912908e-05, "loss": 0.5039, "step": 19840 }, { "epoch": 1.926062487871143, "grad_norm": 2.24550841489829, "learning_rate": 1.7957038094620026e-05, "loss": 0.4763, "step": 19850 }, { "epoch": 1.9270327964292644, "grad_norm": 1.9055133702740896, "learning_rate": 1.7940813810110976e-05, "loss": 0.4978, "step": 19860 }, { "epoch": 1.928003104987386, "grad_norm": 1.6202922943730298, "learning_rate": 1.7924589525601922e-05, "loss": 0.4467, "step": 19870 }, { "epoch": 1.9289734135455074, "grad_norm": 1.7946217889300042, "learning_rate": 1.790836524109287e-05, "loss": 0.4975, "step": 19880 }, { "epoch": 1.929943722103629, "grad_norm": 2.0837037474907776, "learning_rate": 1.7892140956583815e-05, "loss": 0.4488, "step": 19890 }, { "epoch": 1.9309140306617505, "grad_norm": 2.284301097020319, "learning_rate": 1.7875916672074765e-05, "loss": 0.5315, "step": 19900 }, { "epoch": 1.931884339219872, "grad_norm": 1.6410720035809963, "learning_rate": 1.785969238756571e-05, "loss": 0.5064, "step": 19910 }, { "epoch": 1.9328546477779933, "grad_norm": 1.913707413180516, "learning_rate": 1.7843468103056658e-05, "loss": 0.5226, "step": 19920 }, { "epoch": 1.9338249563361147, "grad_norm": 1.972749697853565, "learning_rate": 1.7827243818547604e-05, "loss": 0.4794, "step": 19930 }, { "epoch": 1.9347952648942364, "grad_norm": 2.4245035874963663, "learning_rate": 1.781101953403855e-05, "loss": 0.472, "step": 19940 }, { "epoch": 1.935765573452358, "grad_norm": 1.6532981118899497, "learning_rate": 1.7794795249529497e-05, "loss": 0.4926, "step": 19950 }, { "epoch": 1.9367358820104794, "grad_norm": 2.301368908039538, "learning_rate": 1.7778570965020443e-05, "loss": 0.4847, "step": 19960 }, { "epoch": 1.9377061905686008, "grad_norm": 1.9798901797296686, "learning_rate": 1.776234668051139e-05, "loss": 0.5041, "step": 19970 }, { "epoch": 1.9386764991267222, "grad_norm": 2.0914880764103825, "learning_rate": 1.774612239600234e-05, "loss": 0.4543, "step": 19980 }, { "epoch": 1.9396468076848437, "grad_norm": 1.1710098992705869, "learning_rate": 1.7729898111493286e-05, "loss": 0.4568, "step": 19990 }, { "epoch": 1.9406171162429653, "grad_norm": 2.475821372597333, "learning_rate": 1.7713673826984232e-05, "loss": 0.503, "step": 20000 }, { "epoch": 1.9406171162429653, "eval_loss": 0.6306876540184021, "eval_runtime": 2470.8815, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 20000 }, { "epoch": 1.9415874248010867, "grad_norm": 1.8135605973580828, "learning_rate": 1.769744954247518e-05, "loss": 0.5225, "step": 20010 }, { "epoch": 1.9425577333592083, "grad_norm": 1.8818540999249187, "learning_rate": 1.7681225257966125e-05, "loss": 0.4411, "step": 20020 }, { "epoch": 1.9435280419173298, "grad_norm": 2.3455403519185767, "learning_rate": 1.766500097345707e-05, "loss": 0.4618, "step": 20030 }, { "epoch": 1.9444983504754512, "grad_norm": 1.8781392474090728, "learning_rate": 1.7648776688948018e-05, "loss": 0.4757, "step": 20040 }, { "epoch": 1.9454686590335726, "grad_norm": 2.4837262255821004, "learning_rate": 1.7632552404438964e-05, "loss": 0.4983, "step": 20050 }, { "epoch": 1.9464389675916942, "grad_norm": 2.0371146667611963, "learning_rate": 1.761632811992991e-05, "loss": 0.4684, "step": 20060 }, { "epoch": 1.9474092761498156, "grad_norm": 1.9070841836100068, "learning_rate": 1.760010383542086e-05, "loss": 0.5148, "step": 20070 }, { "epoch": 1.9483795847079373, "grad_norm": 1.6764022613616618, "learning_rate": 1.7583879550911806e-05, "loss": 0.5129, "step": 20080 }, { "epoch": 1.9493498932660587, "grad_norm": 1.801575932413747, "learning_rate": 1.7567655266402753e-05, "loss": 0.4924, "step": 20090 }, { "epoch": 1.95032020182418, "grad_norm": 1.417352324977605, "learning_rate": 1.75514309818937e-05, "loss": 0.4871, "step": 20100 }, { "epoch": 1.9512905103823015, "grad_norm": 1.7430495978746263, "learning_rate": 1.7535206697384646e-05, "loss": 0.4906, "step": 20110 }, { "epoch": 1.952260818940423, "grad_norm": 1.9378606596579797, "learning_rate": 1.7518982412875592e-05, "loss": 0.4441, "step": 20120 }, { "epoch": 1.9532311274985446, "grad_norm": 2.024032521983931, "learning_rate": 1.750275812836654e-05, "loss": 0.4574, "step": 20130 }, { "epoch": 1.9542014360566662, "grad_norm": 1.9748616447898866, "learning_rate": 1.7486533843857485e-05, "loss": 0.4955, "step": 20140 }, { "epoch": 1.9551717446147876, "grad_norm": 2.0789037317300876, "learning_rate": 1.7470309559348435e-05, "loss": 0.5176, "step": 20150 }, { "epoch": 1.956142053172909, "grad_norm": 1.7669158731638315, "learning_rate": 1.745408527483938e-05, "loss": 0.4242, "step": 20160 }, { "epoch": 1.9571123617310304, "grad_norm": 2.15917406316717, "learning_rate": 1.7437860990330327e-05, "loss": 0.457, "step": 20170 }, { "epoch": 1.9580826702891518, "grad_norm": 1.7798687601178151, "learning_rate": 1.7421636705821274e-05, "loss": 0.467, "step": 20180 }, { "epoch": 1.9590529788472735, "grad_norm": 2.2050151044492154, "learning_rate": 1.740541242131222e-05, "loss": 0.4562, "step": 20190 }, { "epoch": 1.9600232874053949, "grad_norm": 1.7571600556165854, "learning_rate": 1.7389188136803166e-05, "loss": 0.4951, "step": 20200 }, { "epoch": 1.9609935959635165, "grad_norm": 2.370912740726444, "learning_rate": 1.7372963852294113e-05, "loss": 0.5177, "step": 20210 }, { "epoch": 1.961963904521638, "grad_norm": 1.9598530660426143, "learning_rate": 1.735673956778506e-05, "loss": 0.4898, "step": 20220 }, { "epoch": 1.9629342130797593, "grad_norm": 1.8458335418058476, "learning_rate": 1.7340515283276006e-05, "loss": 0.4685, "step": 20230 }, { "epoch": 1.9639045216378808, "grad_norm": 2.086603579490736, "learning_rate": 1.7324290998766955e-05, "loss": 0.4655, "step": 20240 }, { "epoch": 1.9648748301960022, "grad_norm": 1.8861003799697185, "learning_rate": 1.7308066714257902e-05, "loss": 0.5012, "step": 20250 }, { "epoch": 1.9658451387541238, "grad_norm": 2.0410888085121655, "learning_rate": 1.729184242974885e-05, "loss": 0.4686, "step": 20260 }, { "epoch": 1.9668154473122454, "grad_norm": 2.0486563420463293, "learning_rate": 1.7275618145239798e-05, "loss": 0.4971, "step": 20270 }, { "epoch": 1.9677857558703669, "grad_norm": 2.053946424813733, "learning_rate": 1.7259393860730744e-05, "loss": 0.5002, "step": 20280 }, { "epoch": 1.9687560644284883, "grad_norm": 1.933061743356704, "learning_rate": 1.724316957622169e-05, "loss": 0.5072, "step": 20290 }, { "epoch": 1.9697263729866097, "grad_norm": 2.066749720982141, "learning_rate": 1.7226945291712637e-05, "loss": 0.4928, "step": 20300 }, { "epoch": 1.970696681544731, "grad_norm": 1.55984280015245, "learning_rate": 1.7210721007203583e-05, "loss": 0.4681, "step": 20310 }, { "epoch": 1.9716669901028527, "grad_norm": 2.0658531156309254, "learning_rate": 1.719449672269453e-05, "loss": 0.4451, "step": 20320 }, { "epoch": 1.9726372986609741, "grad_norm": 1.56362708496437, "learning_rate": 1.7178272438185476e-05, "loss": 0.5221, "step": 20330 }, { "epoch": 1.9736076072190958, "grad_norm": 2.161774232597424, "learning_rate": 1.7162048153676426e-05, "loss": 0.5527, "step": 20340 }, { "epoch": 1.9745779157772172, "grad_norm": 2.411930244708304, "learning_rate": 1.7145823869167372e-05, "loss": 0.4742, "step": 20350 }, { "epoch": 1.9755482243353386, "grad_norm": 2.0274695839961803, "learning_rate": 1.712959958465832e-05, "loss": 0.5055, "step": 20360 }, { "epoch": 1.97651853289346, "grad_norm": 1.9046814027988292, "learning_rate": 1.7113375300149265e-05, "loss": 0.5018, "step": 20370 }, { "epoch": 1.9774888414515817, "grad_norm": 2.6745574351193437, "learning_rate": 1.709715101564021e-05, "loss": 0.5013, "step": 20380 }, { "epoch": 1.978459150009703, "grad_norm": 1.983809422276857, "learning_rate": 1.7080926731131158e-05, "loss": 0.506, "step": 20390 }, { "epoch": 1.9794294585678247, "grad_norm": 1.8313417501684919, "learning_rate": 1.7064702446622104e-05, "loss": 0.4913, "step": 20400 }, { "epoch": 1.9803997671259461, "grad_norm": 1.9488632706180251, "learning_rate": 1.704847816211305e-05, "loss": 0.4753, "step": 20410 }, { "epoch": 1.9813700756840675, "grad_norm": 2.1235067779704386, "learning_rate": 1.7032253877603997e-05, "loss": 0.5076, "step": 20420 }, { "epoch": 1.982340384242189, "grad_norm": 1.9628606937438253, "learning_rate": 1.7016029593094947e-05, "loss": 0.4488, "step": 20430 }, { "epoch": 1.9833106928003104, "grad_norm": 2.053405187246396, "learning_rate": 1.6999805308585893e-05, "loss": 0.4492, "step": 20440 }, { "epoch": 1.984281001358432, "grad_norm": 2.027877392955927, "learning_rate": 1.698358102407684e-05, "loss": 0.4481, "step": 20450 }, { "epoch": 1.9852513099165536, "grad_norm": 2.6424653212507585, "learning_rate": 1.6967356739567786e-05, "loss": 0.4866, "step": 20460 }, { "epoch": 1.986221618474675, "grad_norm": 1.798153398364296, "learning_rate": 1.6951132455058732e-05, "loss": 0.4901, "step": 20470 }, { "epoch": 1.9871919270327965, "grad_norm": 1.9375715199504966, "learning_rate": 1.693490817054968e-05, "loss": 0.5145, "step": 20480 }, { "epoch": 1.9881622355909179, "grad_norm": 2.227384783436997, "learning_rate": 1.6918683886040625e-05, "loss": 0.5234, "step": 20490 }, { "epoch": 1.9891325441490393, "grad_norm": 1.6406328006727198, "learning_rate": 1.690245960153157e-05, "loss": 0.4938, "step": 20500 }, { "epoch": 1.9891325441490393, "eval_loss": 0.6277603507041931, "eval_runtime": 3080.184, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 20500 }, { "epoch": 1.990102852707161, "grad_norm": 2.189175771584087, "learning_rate": 1.688623531702252e-05, "loss": 0.4815, "step": 20510 }, { "epoch": 1.9910731612652823, "grad_norm": 1.9037268932948403, "learning_rate": 1.6870011032513468e-05, "loss": 0.4675, "step": 20520 }, { "epoch": 1.992043469823404, "grad_norm": 2.1370328259985096, "learning_rate": 1.6853786748004414e-05, "loss": 0.4696, "step": 20530 }, { "epoch": 1.9930137783815254, "grad_norm": 2.407631343670518, "learning_rate": 1.683756246349536e-05, "loss": 0.4904, "step": 20540 }, { "epoch": 1.9939840869396468, "grad_norm": 2.113566676023369, "learning_rate": 1.6821338178986307e-05, "loss": 0.4713, "step": 20550 }, { "epoch": 1.9949543954977682, "grad_norm": 1.7998412815701594, "learning_rate": 1.6805113894477253e-05, "loss": 0.4638, "step": 20560 }, { "epoch": 1.9959247040558896, "grad_norm": 2.129222373307167, "learning_rate": 1.67888896099682e-05, "loss": 0.4567, "step": 20570 }, { "epoch": 1.9968950126140113, "grad_norm": 2.0439893320145197, "learning_rate": 1.6772665325459146e-05, "loss": 0.499, "step": 20580 }, { "epoch": 1.997865321172133, "grad_norm": 1.996718548398611, "learning_rate": 1.6756441040950092e-05, "loss": 0.4657, "step": 20590 }, { "epoch": 1.9988356297302543, "grad_norm": 1.8047941252901814, "learning_rate": 1.6740216756441042e-05, "loss": 0.4723, "step": 20600 }, { "epoch": 1.9998059382883757, "grad_norm": 1.8452676210065997, "learning_rate": 1.672399247193199e-05, "loss": 0.4802, "step": 20610 }, { "epoch": 2.000776246846497, "grad_norm": 2.2446601261564254, "learning_rate": 1.6707768187422935e-05, "loss": 0.501, "step": 20620 }, { "epoch": 2.0017465554046185, "grad_norm": 1.6471605415361736, "learning_rate": 1.669154390291388e-05, "loss": 0.4735, "step": 20630 }, { "epoch": 2.00271686396274, "grad_norm": 2.038536155314213, "learning_rate": 1.667531961840483e-05, "loss": 0.4575, "step": 20640 }, { "epoch": 2.003687172520862, "grad_norm": 2.1416733149321834, "learning_rate": 1.6659095333895777e-05, "loss": 0.5152, "step": 20650 }, { "epoch": 2.0046574810789832, "grad_norm": 1.9639254031456526, "learning_rate": 1.6642871049386724e-05, "loss": 0.4424, "step": 20660 }, { "epoch": 2.0056277896371046, "grad_norm": 2.1201945824283777, "learning_rate": 1.662664676487767e-05, "loss": 0.5194, "step": 20670 }, { "epoch": 2.006598098195226, "grad_norm": 2.0259029186156616, "learning_rate": 1.6610422480368617e-05, "loss": 0.5085, "step": 20680 }, { "epoch": 2.0075684067533475, "grad_norm": 1.5035201214590004, "learning_rate": 1.6594198195859563e-05, "loss": 0.445, "step": 20690 }, { "epoch": 2.008538715311469, "grad_norm": 1.655200868630425, "learning_rate": 1.6577973911350513e-05, "loss": 0.4247, "step": 20700 }, { "epoch": 2.0095090238695907, "grad_norm": 2.1707069905375764, "learning_rate": 1.656174962684146e-05, "loss": 0.4783, "step": 20710 }, { "epoch": 2.010479332427712, "grad_norm": 1.8212536925620644, "learning_rate": 1.6545525342332405e-05, "loss": 0.4972, "step": 20720 }, { "epoch": 2.0114496409858336, "grad_norm": 2.126405409204561, "learning_rate": 1.6529301057823352e-05, "loss": 0.4878, "step": 20730 }, { "epoch": 2.012419949543955, "grad_norm": 2.077712603387953, "learning_rate": 1.6513076773314298e-05, "loss": 0.4571, "step": 20740 }, { "epoch": 2.0133902581020764, "grad_norm": 2.1605056660181345, "learning_rate": 1.6496852488805245e-05, "loss": 0.5012, "step": 20750 }, { "epoch": 2.014360566660198, "grad_norm": 1.8021397618450556, "learning_rate": 1.648062820429619e-05, "loss": 0.5049, "step": 20760 }, { "epoch": 2.015330875218319, "grad_norm": 1.8857630619437384, "learning_rate": 1.6464403919787137e-05, "loss": 0.4726, "step": 20770 }, { "epoch": 2.016301183776441, "grad_norm": 1.7074209121094917, "learning_rate": 1.6448179635278084e-05, "loss": 0.4914, "step": 20780 }, { "epoch": 2.0172714923345625, "grad_norm": 2.6019622648688614, "learning_rate": 1.6431955350769033e-05, "loss": 0.4239, "step": 20790 }, { "epoch": 2.018241800892684, "grad_norm": 1.6879446237580993, "learning_rate": 1.641573106625998e-05, "loss": 0.4657, "step": 20800 }, { "epoch": 2.0192121094508053, "grad_norm": 1.4603799114693456, "learning_rate": 1.6399506781750926e-05, "loss": 0.4503, "step": 20810 }, { "epoch": 2.0201824180089267, "grad_norm": 1.948639651180534, "learning_rate": 1.6383282497241873e-05, "loss": 0.4544, "step": 20820 }, { "epoch": 2.021152726567048, "grad_norm": 2.1458744835535617, "learning_rate": 1.636705821273282e-05, "loss": 0.4632, "step": 20830 }, { "epoch": 2.02212303512517, "grad_norm": 2.3393805927580997, "learning_rate": 1.6350833928223765e-05, "loss": 0.4686, "step": 20840 }, { "epoch": 2.0230933436832914, "grad_norm": 1.8274491944101148, "learning_rate": 1.6334609643714712e-05, "loss": 0.4509, "step": 20850 }, { "epoch": 2.024063652241413, "grad_norm": 2.5507345485151767, "learning_rate": 1.6318385359205658e-05, "loss": 0.5145, "step": 20860 }, { "epoch": 2.0250339607995342, "grad_norm": 1.878444628312939, "learning_rate": 1.6302161074696608e-05, "loss": 0.449, "step": 20870 }, { "epoch": 2.0260042693576557, "grad_norm": 2.1096395212213235, "learning_rate": 1.6285936790187554e-05, "loss": 0.409, "step": 20880 }, { "epoch": 2.026974577915777, "grad_norm": 1.9597465473748303, "learning_rate": 1.62697125056785e-05, "loss": 0.4909, "step": 20890 }, { "epoch": 2.0279448864738985, "grad_norm": 1.4262456717146317, "learning_rate": 1.6253488221169447e-05, "loss": 0.4933, "step": 20900 }, { "epoch": 2.0289151950320203, "grad_norm": 1.7254635997166883, "learning_rate": 1.6237263936660393e-05, "loss": 0.4581, "step": 20910 }, { "epoch": 2.0298855035901417, "grad_norm": 1.8255091897283775, "learning_rate": 1.622103965215134e-05, "loss": 0.462, "step": 20920 }, { "epoch": 2.030855812148263, "grad_norm": 1.592461465195459, "learning_rate": 1.6204815367642286e-05, "loss": 0.5343, "step": 20930 }, { "epoch": 2.0318261207063846, "grad_norm": 2.4124278045088445, "learning_rate": 1.6188591083133233e-05, "loss": 0.4738, "step": 20940 }, { "epoch": 2.032796429264506, "grad_norm": 1.4707790381425363, "learning_rate": 1.617236679862418e-05, "loss": 0.4183, "step": 20950 }, { "epoch": 2.0337667378226274, "grad_norm": 1.7383346761795593, "learning_rate": 1.615614251411513e-05, "loss": 0.4351, "step": 20960 }, { "epoch": 2.0347370463807493, "grad_norm": 2.292481354882268, "learning_rate": 1.6139918229606075e-05, "loss": 0.4294, "step": 20970 }, { "epoch": 2.0357073549388707, "grad_norm": 2.158168642341278, "learning_rate": 1.612369394509702e-05, "loss": 0.4703, "step": 20980 }, { "epoch": 2.036677663496992, "grad_norm": 2.283161751864384, "learning_rate": 1.6107469660587968e-05, "loss": 0.5128, "step": 20990 }, { "epoch": 2.0376479720551135, "grad_norm": 2.125582882132147, "learning_rate": 1.6091245376078914e-05, "loss": 0.4652, "step": 21000 }, { "epoch": 2.0376479720551135, "eval_loss": 0.6309866905212402, "eval_runtime": 3136.9547, "eval_samples_per_second": 0.571, "eval_steps_per_second": 0.286, "step": 21000 }, { "epoch": 2.038618280613235, "grad_norm": 1.9414092633891158, "learning_rate": 1.607502109156986e-05, "loss": 0.4451, "step": 21010 }, { "epoch": 2.0395885891713563, "grad_norm": 2.0603594802374863, "learning_rate": 1.6058796807060807e-05, "loss": 0.4569, "step": 21020 }, { "epoch": 2.040558897729478, "grad_norm": 1.6323777472844119, "learning_rate": 1.6042572522551757e-05, "loss": 0.4764, "step": 21030 }, { "epoch": 2.0415292062875996, "grad_norm": 1.8941882942793062, "learning_rate": 1.6026348238042703e-05, "loss": 0.4514, "step": 21040 }, { "epoch": 2.042499514845721, "grad_norm": 1.905361739266791, "learning_rate": 1.601012395353365e-05, "loss": 0.4681, "step": 21050 }, { "epoch": 2.0434698234038424, "grad_norm": 2.357513508114439, "learning_rate": 1.59938996690246e-05, "loss": 0.5029, "step": 21060 }, { "epoch": 2.044440131961964, "grad_norm": 2.045526641253892, "learning_rate": 1.5977675384515546e-05, "loss": 0.5031, "step": 21070 }, { "epoch": 2.0454104405200852, "grad_norm": 1.8018214238085957, "learning_rate": 1.5961451100006492e-05, "loss": 0.475, "step": 21080 }, { "epoch": 2.0463807490782067, "grad_norm": 2.157846458503379, "learning_rate": 1.594522681549744e-05, "loss": 0.4922, "step": 21090 }, { "epoch": 2.0473510576363285, "grad_norm": 2.2821214882561773, "learning_rate": 1.5929002530988385e-05, "loss": 0.4361, "step": 21100 }, { "epoch": 2.04832136619445, "grad_norm": 2.4410635323771395, "learning_rate": 1.591277824647933e-05, "loss": 0.4426, "step": 21110 }, { "epoch": 2.0492916747525713, "grad_norm": 1.847883527949176, "learning_rate": 1.5896553961970278e-05, "loss": 0.4998, "step": 21120 }, { "epoch": 2.0502619833106928, "grad_norm": 2.086929164640455, "learning_rate": 1.5880329677461224e-05, "loss": 0.5218, "step": 21130 }, { "epoch": 2.051232291868814, "grad_norm": 1.8955619202400775, "learning_rate": 1.5864105392952174e-05, "loss": 0.519, "step": 21140 }, { "epoch": 2.0522026004269356, "grad_norm": 2.43252621811095, "learning_rate": 1.584788110844312e-05, "loss": 0.4439, "step": 21150 }, { "epoch": 2.0531729089850574, "grad_norm": 2.0827457082492464, "learning_rate": 1.5831656823934067e-05, "loss": 0.5047, "step": 21160 }, { "epoch": 2.054143217543179, "grad_norm": 1.9212860192837349, "learning_rate": 1.5815432539425013e-05, "loss": 0.4655, "step": 21170 }, { "epoch": 2.0551135261013003, "grad_norm": 2.090169452444841, "learning_rate": 1.579920825491596e-05, "loss": 0.4421, "step": 21180 }, { "epoch": 2.0560838346594217, "grad_norm": 1.9503749019739738, "learning_rate": 1.5782983970406906e-05, "loss": 0.442, "step": 21190 }, { "epoch": 2.057054143217543, "grad_norm": 1.724201210329928, "learning_rate": 1.5766759685897852e-05, "loss": 0.3953, "step": 21200 }, { "epoch": 2.0580244517756645, "grad_norm": 2.081302027118628, "learning_rate": 1.57505354013888e-05, "loss": 0.4846, "step": 21210 }, { "epoch": 2.0589947603337864, "grad_norm": 2.5592097491782546, "learning_rate": 1.5734311116879745e-05, "loss": 0.4428, "step": 21220 }, { "epoch": 2.059965068891908, "grad_norm": 1.9807395474177856, "learning_rate": 1.5718086832370695e-05, "loss": 0.4843, "step": 21230 }, { "epoch": 2.060935377450029, "grad_norm": 1.946678515294379, "learning_rate": 1.570186254786164e-05, "loss": 0.5123, "step": 21240 }, { "epoch": 2.0619056860081506, "grad_norm": 1.704583779463954, "learning_rate": 1.5685638263352587e-05, "loss": 0.5018, "step": 21250 }, { "epoch": 2.062875994566272, "grad_norm": 1.9509206749604142, "learning_rate": 1.5669413978843534e-05, "loss": 0.4156, "step": 21260 }, { "epoch": 2.0638463031243934, "grad_norm": 2.0838442571499365, "learning_rate": 1.565318969433448e-05, "loss": 0.4632, "step": 21270 }, { "epoch": 2.064816611682515, "grad_norm": 1.9349794015053616, "learning_rate": 1.5636965409825427e-05, "loss": 0.4493, "step": 21280 }, { "epoch": 2.0657869202406367, "grad_norm": 2.188378390390559, "learning_rate": 1.5620741125316373e-05, "loss": 0.5279, "step": 21290 }, { "epoch": 2.066757228798758, "grad_norm": 1.8328420572385127, "learning_rate": 1.560451684080732e-05, "loss": 0.4611, "step": 21300 }, { "epoch": 2.0677275373568795, "grad_norm": 1.563376875838364, "learning_rate": 1.5588292556298266e-05, "loss": 0.4449, "step": 21310 }, { "epoch": 2.068697845915001, "grad_norm": 2.231697202375159, "learning_rate": 1.5572068271789215e-05, "loss": 0.4763, "step": 21320 }, { "epoch": 2.0696681544731224, "grad_norm": 2.116198927861082, "learning_rate": 1.5555843987280162e-05, "loss": 0.4313, "step": 21330 }, { "epoch": 2.0706384630312438, "grad_norm": 2.3801041085422763, "learning_rate": 1.5539619702771108e-05, "loss": 0.4643, "step": 21340 }, { "epoch": 2.0716087715893656, "grad_norm": 1.8571501672365118, "learning_rate": 1.5523395418262055e-05, "loss": 0.4423, "step": 21350 }, { "epoch": 2.072579080147487, "grad_norm": 1.6559131513123913, "learning_rate": 1.5507171133753e-05, "loss": 0.4437, "step": 21360 }, { "epoch": 2.0735493887056085, "grad_norm": 2.137893390544345, "learning_rate": 1.5490946849243947e-05, "loss": 0.4732, "step": 21370 }, { "epoch": 2.07451969726373, "grad_norm": 1.4742723623250382, "learning_rate": 1.5474722564734894e-05, "loss": 0.412, "step": 21380 }, { "epoch": 2.0754900058218513, "grad_norm": 2.1318829799319543, "learning_rate": 1.545849828022584e-05, "loss": 0.4916, "step": 21390 }, { "epoch": 2.0764603143799727, "grad_norm": 2.307110089522299, "learning_rate": 1.544227399571679e-05, "loss": 0.4124, "step": 21400 }, { "epoch": 2.077430622938094, "grad_norm": 2.3097275184207637, "learning_rate": 1.5426049711207736e-05, "loss": 0.4799, "step": 21410 }, { "epoch": 2.078400931496216, "grad_norm": 1.5854013780070428, "learning_rate": 1.5409825426698686e-05, "loss": 0.4759, "step": 21420 }, { "epoch": 2.0793712400543374, "grad_norm": 2.008961619138051, "learning_rate": 1.5393601142189632e-05, "loss": 0.4844, "step": 21430 }, { "epoch": 2.080341548612459, "grad_norm": 1.848635541993877, "learning_rate": 1.537737685768058e-05, "loss": 0.4558, "step": 21440 }, { "epoch": 2.08131185717058, "grad_norm": 2.3862164923195217, "learning_rate": 1.5361152573171525e-05, "loss": 0.4514, "step": 21450 }, { "epoch": 2.0822821657287016, "grad_norm": 2.322912088388782, "learning_rate": 1.534492828866247e-05, "loss": 0.4507, "step": 21460 }, { "epoch": 2.083252474286823, "grad_norm": 2.1765100761424074, "learning_rate": 1.5328704004153418e-05, "loss": 0.4114, "step": 21470 }, { "epoch": 2.084222782844945, "grad_norm": 1.915448768034906, "learning_rate": 1.5312479719644364e-05, "loss": 0.4794, "step": 21480 }, { "epoch": 2.0851930914030663, "grad_norm": 1.955422403883371, "learning_rate": 1.529625543513531e-05, "loss": 0.485, "step": 21490 }, { "epoch": 2.0861633999611877, "grad_norm": 1.5866416015413762, "learning_rate": 1.528003115062626e-05, "loss": 0.4961, "step": 21500 }, { "epoch": 2.0861633999611877, "eval_loss": 0.6304420232772827, "eval_runtime": 3075.3205, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 21500 }, { "epoch": 2.087133708519309, "grad_norm": 1.8961317659006691, "learning_rate": 1.5263806866117207e-05, "loss": 0.4803, "step": 21510 }, { "epoch": 2.0881040170774305, "grad_norm": 2.1574061097476633, "learning_rate": 1.5247582581608153e-05, "loss": 0.4897, "step": 21520 }, { "epoch": 2.089074325635552, "grad_norm": 2.080433640436673, "learning_rate": 1.52313582970991e-05, "loss": 0.4623, "step": 21530 }, { "epoch": 2.0900446341936734, "grad_norm": 2.274765795002327, "learning_rate": 1.5215134012590046e-05, "loss": 0.5149, "step": 21540 }, { "epoch": 2.0910149427517952, "grad_norm": 2.514598169769852, "learning_rate": 1.5198909728080992e-05, "loss": 0.4028, "step": 21550 }, { "epoch": 2.0919852513099166, "grad_norm": 2.1324330512224057, "learning_rate": 1.5182685443571939e-05, "loss": 0.5213, "step": 21560 }, { "epoch": 2.092955559868038, "grad_norm": 2.0829108139421106, "learning_rate": 1.5166461159062887e-05, "loss": 0.4338, "step": 21570 }, { "epoch": 2.0939258684261595, "grad_norm": 1.9606921161193192, "learning_rate": 1.5150236874553833e-05, "loss": 0.5347, "step": 21580 }, { "epoch": 2.094896176984281, "grad_norm": 1.9007868794459526, "learning_rate": 1.513401259004478e-05, "loss": 0.4698, "step": 21590 }, { "epoch": 2.0958664855424023, "grad_norm": 2.2070560808992385, "learning_rate": 1.5117788305535726e-05, "loss": 0.4992, "step": 21600 }, { "epoch": 2.096836794100524, "grad_norm": 1.7052502823407865, "learning_rate": 1.5101564021026674e-05, "loss": 0.4448, "step": 21610 }, { "epoch": 2.0978071026586456, "grad_norm": 1.9838818965293705, "learning_rate": 1.508533973651762e-05, "loss": 0.452, "step": 21620 }, { "epoch": 2.098777411216767, "grad_norm": 2.361870038528201, "learning_rate": 1.5069115452008567e-05, "loss": 0.4772, "step": 21630 }, { "epoch": 2.0997477197748884, "grad_norm": 2.3190708510226767, "learning_rate": 1.5052891167499513e-05, "loss": 0.4732, "step": 21640 }, { "epoch": 2.10071802833301, "grad_norm": 2.0341040427547967, "learning_rate": 1.5036666882990461e-05, "loss": 0.5013, "step": 21650 }, { "epoch": 2.101688336891131, "grad_norm": 2.2854031285851337, "learning_rate": 1.5020442598481408e-05, "loss": 0.4587, "step": 21660 }, { "epoch": 2.102658645449253, "grad_norm": 2.171534197336386, "learning_rate": 1.5004218313972354e-05, "loss": 0.4685, "step": 21670 }, { "epoch": 2.1036289540073745, "grad_norm": 2.237731272885117, "learning_rate": 1.49879940294633e-05, "loss": 0.4714, "step": 21680 }, { "epoch": 2.104599262565496, "grad_norm": 1.83625835750722, "learning_rate": 1.4971769744954247e-05, "loss": 0.4858, "step": 21690 }, { "epoch": 2.1055695711236173, "grad_norm": 2.1226910702010935, "learning_rate": 1.4955545460445195e-05, "loss": 0.4735, "step": 21700 }, { "epoch": 2.1065398796817387, "grad_norm": 2.4412848671448764, "learning_rate": 1.4939321175936141e-05, "loss": 0.4955, "step": 21710 }, { "epoch": 2.10751018823986, "grad_norm": 2.171686191103959, "learning_rate": 1.4923096891427088e-05, "loss": 0.4882, "step": 21720 }, { "epoch": 2.1084804967979816, "grad_norm": 1.9028000906788562, "learning_rate": 1.4906872606918034e-05, "loss": 0.5008, "step": 21730 }, { "epoch": 2.1094508053561034, "grad_norm": 2.032206148469331, "learning_rate": 1.4890648322408982e-05, "loss": 0.4688, "step": 21740 }, { "epoch": 2.110421113914225, "grad_norm": 2.2254255720592457, "learning_rate": 1.4874424037899929e-05, "loss": 0.434, "step": 21750 }, { "epoch": 2.1113914224723462, "grad_norm": 2.013615294714695, "learning_rate": 1.4858199753390875e-05, "loss": 0.4429, "step": 21760 }, { "epoch": 2.1123617310304676, "grad_norm": 1.625353905024779, "learning_rate": 1.4841975468881821e-05, "loss": 0.4197, "step": 21770 }, { "epoch": 2.113332039588589, "grad_norm": 2.6956426206670074, "learning_rate": 1.482575118437277e-05, "loss": 0.4412, "step": 21780 }, { "epoch": 2.1143023481467105, "grad_norm": 1.642670039289611, "learning_rate": 1.4809526899863716e-05, "loss": 0.4365, "step": 21790 }, { "epoch": 2.1152726567048323, "grad_norm": 2.047556877474702, "learning_rate": 1.4793302615354662e-05, "loss": 0.4882, "step": 21800 }, { "epoch": 2.1162429652629537, "grad_norm": 2.666095406948268, "learning_rate": 1.4777078330845612e-05, "loss": 0.4885, "step": 21810 }, { "epoch": 2.117213273821075, "grad_norm": 2.0148741202390736, "learning_rate": 1.4760854046336558e-05, "loss": 0.5092, "step": 21820 }, { "epoch": 2.1181835823791966, "grad_norm": 1.5220212421711388, "learning_rate": 1.4744629761827505e-05, "loss": 0.5007, "step": 21830 }, { "epoch": 2.119153890937318, "grad_norm": 2.2566622563684446, "learning_rate": 1.4728405477318453e-05, "loss": 0.4386, "step": 21840 }, { "epoch": 2.1201241994954394, "grad_norm": 2.13286935915719, "learning_rate": 1.47121811928094e-05, "loss": 0.4559, "step": 21850 }, { "epoch": 2.1210945080535613, "grad_norm": 1.8220197546440986, "learning_rate": 1.4695956908300346e-05, "loss": 0.451, "step": 21860 }, { "epoch": 2.1220648166116827, "grad_norm": 2.2468599066846653, "learning_rate": 1.4679732623791292e-05, "loss": 0.446, "step": 21870 }, { "epoch": 2.123035125169804, "grad_norm": 1.8918874944552218, "learning_rate": 1.466350833928224e-05, "loss": 0.459, "step": 21880 }, { "epoch": 2.1240054337279255, "grad_norm": 2.247255412788798, "learning_rate": 1.4647284054773186e-05, "loss": 0.5151, "step": 21890 }, { "epoch": 2.124975742286047, "grad_norm": 1.8291853119382993, "learning_rate": 1.4631059770264133e-05, "loss": 0.4557, "step": 21900 }, { "epoch": 2.1259460508441683, "grad_norm": 2.2000080164154654, "learning_rate": 1.461483548575508e-05, "loss": 0.5106, "step": 21910 }, { "epoch": 2.1269163594022897, "grad_norm": 1.608928034740487, "learning_rate": 1.4598611201246026e-05, "loss": 0.4696, "step": 21920 }, { "epoch": 2.1278866679604116, "grad_norm": 1.7796242187219558, "learning_rate": 1.4582386916736974e-05, "loss": 0.4372, "step": 21930 }, { "epoch": 2.128856976518533, "grad_norm": 1.9355986409124974, "learning_rate": 1.456616263222792e-05, "loss": 0.4601, "step": 21940 }, { "epoch": 2.1298272850766544, "grad_norm": 2.5044478412060776, "learning_rate": 1.4549938347718866e-05, "loss": 0.5153, "step": 21950 }, { "epoch": 2.130797593634776, "grad_norm": 2.1644310902185264, "learning_rate": 1.4533714063209813e-05, "loss": 0.4755, "step": 21960 }, { "epoch": 2.1317679021928972, "grad_norm": 1.9759401921373174, "learning_rate": 1.451748977870076e-05, "loss": 0.5306, "step": 21970 }, { "epoch": 2.1327382107510187, "grad_norm": 2.5662041065947827, "learning_rate": 1.4501265494191707e-05, "loss": 0.4936, "step": 21980 }, { "epoch": 2.13370851930914, "grad_norm": 1.834252286116916, "learning_rate": 1.4485041209682654e-05, "loss": 0.4248, "step": 21990 }, { "epoch": 2.134678827867262, "grad_norm": 2.048099727707521, "learning_rate": 1.44688169251736e-05, "loss": 0.4573, "step": 22000 }, { "epoch": 2.134678827867262, "eval_loss": 0.6303107142448425, "eval_runtime": 3417.8488, "eval_samples_per_second": 0.524, "eval_steps_per_second": 0.262, "step": 22000 }, { "epoch": 2.1356491364253833, "grad_norm": 2.032462630747061, "learning_rate": 1.4452592640664548e-05, "loss": 0.4496, "step": 22010 }, { "epoch": 2.1366194449835048, "grad_norm": 2.0538977423462548, "learning_rate": 1.4436368356155494e-05, "loss": 0.4512, "step": 22020 }, { "epoch": 2.137589753541626, "grad_norm": 2.0559328662146403, "learning_rate": 1.442014407164644e-05, "loss": 0.4469, "step": 22030 }, { "epoch": 2.1385600620997476, "grad_norm": 1.8412343311132753, "learning_rate": 1.4403919787137387e-05, "loss": 0.449, "step": 22040 }, { "epoch": 2.139530370657869, "grad_norm": 2.0513694288656765, "learning_rate": 1.4387695502628334e-05, "loss": 0.4709, "step": 22050 }, { "epoch": 2.140500679215991, "grad_norm": 1.7890079195029862, "learning_rate": 1.4371471218119282e-05, "loss": 0.4419, "step": 22060 }, { "epoch": 2.1414709877741123, "grad_norm": 2.535357630446327, "learning_rate": 1.4355246933610228e-05, "loss": 0.4305, "step": 22070 }, { "epoch": 2.1424412963322337, "grad_norm": 2.2977476166849957, "learning_rate": 1.4339022649101174e-05, "loss": 0.4664, "step": 22080 }, { "epoch": 2.143411604890355, "grad_norm": 1.8778176122292005, "learning_rate": 1.432279836459212e-05, "loss": 0.4321, "step": 22090 }, { "epoch": 2.1443819134484765, "grad_norm": 2.2740974899219526, "learning_rate": 1.4306574080083069e-05, "loss": 0.4582, "step": 22100 }, { "epoch": 2.145352222006598, "grad_norm": 1.7411166330088808, "learning_rate": 1.4290349795574015e-05, "loss": 0.4703, "step": 22110 }, { "epoch": 2.1463225305647198, "grad_norm": 2.270025807621109, "learning_rate": 1.4274125511064962e-05, "loss": 0.462, "step": 22120 }, { "epoch": 2.147292839122841, "grad_norm": 2.047404098376008, "learning_rate": 1.4257901226555908e-05, "loss": 0.4572, "step": 22130 }, { "epoch": 2.1482631476809626, "grad_norm": 1.9025461949345281, "learning_rate": 1.4241676942046856e-05, "loss": 0.4523, "step": 22140 }, { "epoch": 2.149233456239084, "grad_norm": 2.41657182633304, "learning_rate": 1.4225452657537803e-05, "loss": 0.4485, "step": 22150 }, { "epoch": 2.1502037647972054, "grad_norm": 2.1984178815028246, "learning_rate": 1.4209228373028749e-05, "loss": 0.4744, "step": 22160 }, { "epoch": 2.151174073355327, "grad_norm": 2.1671082483900044, "learning_rate": 1.4193004088519695e-05, "loss": 0.4277, "step": 22170 }, { "epoch": 2.1521443819134483, "grad_norm": 2.2526638629347118, "learning_rate": 1.4176779804010643e-05, "loss": 0.4078, "step": 22180 }, { "epoch": 2.15311469047157, "grad_norm": 2.4484821608494665, "learning_rate": 1.416055551950159e-05, "loss": 0.4726, "step": 22190 }, { "epoch": 2.1540849990296915, "grad_norm": 2.145545596752741, "learning_rate": 1.414433123499254e-05, "loss": 0.5181, "step": 22200 }, { "epoch": 2.155055307587813, "grad_norm": 2.2593836123374, "learning_rate": 1.4128106950483486e-05, "loss": 0.4986, "step": 22210 }, { "epoch": 2.1560256161459344, "grad_norm": 1.8007881267775863, "learning_rate": 1.4111882665974432e-05, "loss": 0.4483, "step": 22220 }, { "epoch": 2.1569959247040558, "grad_norm": 1.348362904445229, "learning_rate": 1.4095658381465379e-05, "loss": 0.4637, "step": 22230 }, { "epoch": 2.157966233262177, "grad_norm": 2.1469770203677445, "learning_rate": 1.4079434096956327e-05, "loss": 0.494, "step": 22240 }, { "epoch": 2.158936541820299, "grad_norm": 2.337402150331418, "learning_rate": 1.4063209812447273e-05, "loss": 0.4761, "step": 22250 }, { "epoch": 2.1599068503784205, "grad_norm": 1.7961706912599362, "learning_rate": 1.404698552793822e-05, "loss": 0.4683, "step": 22260 }, { "epoch": 2.160877158936542, "grad_norm": 1.6086385155901042, "learning_rate": 1.4030761243429166e-05, "loss": 0.4305, "step": 22270 }, { "epoch": 2.1618474674946633, "grad_norm": 1.853121432266295, "learning_rate": 1.4014536958920112e-05, "loss": 0.4808, "step": 22280 }, { "epoch": 2.1628177760527847, "grad_norm": 2.1594671768519094, "learning_rate": 1.399831267441106e-05, "loss": 0.4279, "step": 22290 }, { "epoch": 2.163788084610906, "grad_norm": 1.8445913886566945, "learning_rate": 1.3982088389902007e-05, "loss": 0.4905, "step": 22300 }, { "epoch": 2.164758393169028, "grad_norm": 1.8975990263034965, "learning_rate": 1.3965864105392953e-05, "loss": 0.4888, "step": 22310 }, { "epoch": 2.1657287017271494, "grad_norm": 2.015470924856379, "learning_rate": 1.39496398208839e-05, "loss": 0.5232, "step": 22320 }, { "epoch": 2.166699010285271, "grad_norm": 2.4943071641031698, "learning_rate": 1.3933415536374848e-05, "loss": 0.4827, "step": 22330 }, { "epoch": 2.167669318843392, "grad_norm": 1.5900299707664105, "learning_rate": 1.3917191251865794e-05, "loss": 0.4963, "step": 22340 }, { "epoch": 2.1686396274015136, "grad_norm": 1.8705765102756569, "learning_rate": 1.390096696735674e-05, "loss": 0.4389, "step": 22350 }, { "epoch": 2.169609935959635, "grad_norm": 1.9703821664953534, "learning_rate": 1.3884742682847687e-05, "loss": 0.3945, "step": 22360 }, { "epoch": 2.1705802445177564, "grad_norm": 1.6647445519639283, "learning_rate": 1.3868518398338635e-05, "loss": 0.4163, "step": 22370 }, { "epoch": 2.1715505530758783, "grad_norm": 1.617333797152806, "learning_rate": 1.3852294113829581e-05, "loss": 0.4943, "step": 22380 }, { "epoch": 2.1725208616339997, "grad_norm": 2.1150619610025796, "learning_rate": 1.3836069829320528e-05, "loss": 0.4941, "step": 22390 }, { "epoch": 2.173491170192121, "grad_norm": 2.70908196165092, "learning_rate": 1.3819845544811474e-05, "loss": 0.448, "step": 22400 }, { "epoch": 2.1744614787502425, "grad_norm": 2.195631441394506, "learning_rate": 1.3803621260302422e-05, "loss": 0.4611, "step": 22410 }, { "epoch": 2.175431787308364, "grad_norm": 1.602301760768748, "learning_rate": 1.3787396975793368e-05, "loss": 0.4837, "step": 22420 }, { "epoch": 2.1764020958664854, "grad_norm": 1.7676765995955546, "learning_rate": 1.3771172691284315e-05, "loss": 0.4798, "step": 22430 }, { "epoch": 2.177372404424607, "grad_norm": 2.211793623579749, "learning_rate": 1.3754948406775261e-05, "loss": 0.4607, "step": 22440 }, { "epoch": 2.1783427129827286, "grad_norm": 1.9526273269630037, "learning_rate": 1.3738724122266208e-05, "loss": 0.4638, "step": 22450 }, { "epoch": 2.17931302154085, "grad_norm": 2.063947829078263, "learning_rate": 1.3722499837757156e-05, "loss": 0.4548, "step": 22460 }, { "epoch": 2.1802833300989715, "grad_norm": 2.127208435257803, "learning_rate": 1.3706275553248102e-05, "loss": 0.4332, "step": 22470 }, { "epoch": 2.181253638657093, "grad_norm": 2.098766619473584, "learning_rate": 1.3690051268739048e-05, "loss": 0.4822, "step": 22480 }, { "epoch": 2.1822239472152143, "grad_norm": 1.5960382905615234, "learning_rate": 1.3673826984229995e-05, "loss": 0.4394, "step": 22490 }, { "epoch": 2.183194255773336, "grad_norm": 2.5521137154858087, "learning_rate": 1.3657602699720943e-05, "loss": 0.4568, "step": 22500 }, { "epoch": 2.183194255773336, "eval_loss": 0.6301902532577515, "eval_runtime": 3074.8498, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 22500 }, { "epoch": 2.1841645643314576, "grad_norm": 1.8964546731319356, "learning_rate": 1.364137841521189e-05, "loss": 0.4569, "step": 22510 }, { "epoch": 2.185134872889579, "grad_norm": 2.1836501057996918, "learning_rate": 1.3625154130702836e-05, "loss": 0.4497, "step": 22520 }, { "epoch": 2.1861051814477004, "grad_norm": 1.6786624803632986, "learning_rate": 1.3608929846193782e-05, "loss": 0.4229, "step": 22530 }, { "epoch": 2.187075490005822, "grad_norm": 2.1475126662304675, "learning_rate": 1.359270556168473e-05, "loss": 0.4412, "step": 22540 }, { "epoch": 2.188045798563943, "grad_norm": 2.322023881303393, "learning_rate": 1.3576481277175676e-05, "loss": 0.4235, "step": 22550 }, { "epoch": 2.1890161071220646, "grad_norm": 2.208490633488266, "learning_rate": 1.3560256992666623e-05, "loss": 0.4997, "step": 22560 }, { "epoch": 2.1899864156801865, "grad_norm": 1.869675214539207, "learning_rate": 1.354403270815757e-05, "loss": 0.4635, "step": 22570 }, { "epoch": 2.190956724238308, "grad_norm": 1.7398189038479703, "learning_rate": 1.3527808423648516e-05, "loss": 0.4128, "step": 22580 }, { "epoch": 2.1919270327964293, "grad_norm": 2.266171424803214, "learning_rate": 1.3511584139139465e-05, "loss": 0.4651, "step": 22590 }, { "epoch": 2.1928973413545507, "grad_norm": 2.3614429446369063, "learning_rate": 1.3495359854630413e-05, "loss": 0.4123, "step": 22600 }, { "epoch": 2.193867649912672, "grad_norm": 1.724551648002317, "learning_rate": 1.347913557012136e-05, "loss": 0.4481, "step": 22610 }, { "epoch": 2.1948379584707935, "grad_norm": 2.0558570962807945, "learning_rate": 1.3462911285612306e-05, "loss": 0.4431, "step": 22620 }, { "epoch": 2.195808267028915, "grad_norm": 1.943776934136987, "learning_rate": 1.3446687001103253e-05, "loss": 0.5128, "step": 22630 }, { "epoch": 2.196778575587037, "grad_norm": 1.958173689925549, "learning_rate": 1.34304627165942e-05, "loss": 0.4628, "step": 22640 }, { "epoch": 2.1977488841451582, "grad_norm": 1.990366090746634, "learning_rate": 1.3414238432085147e-05, "loss": 0.459, "step": 22650 }, { "epoch": 2.1987191927032796, "grad_norm": 1.8373680505548435, "learning_rate": 1.3398014147576093e-05, "loss": 0.4397, "step": 22660 }, { "epoch": 2.199689501261401, "grad_norm": 1.8078209399386682, "learning_rate": 1.338178986306704e-05, "loss": 0.5215, "step": 22670 }, { "epoch": 2.2006598098195225, "grad_norm": 2.400832940310904, "learning_rate": 1.3365565578557986e-05, "loss": 0.4926, "step": 22680 }, { "epoch": 2.201630118377644, "grad_norm": 2.055045167286068, "learning_rate": 1.3349341294048934e-05, "loss": 0.4461, "step": 22690 }, { "epoch": 2.2026004269357657, "grad_norm": 1.8144420639570567, "learning_rate": 1.333311700953988e-05, "loss": 0.4112, "step": 22700 }, { "epoch": 2.203570735493887, "grad_norm": 2.391519552042341, "learning_rate": 1.3316892725030827e-05, "loss": 0.439, "step": 22710 }, { "epoch": 2.2045410440520086, "grad_norm": 1.563384599944884, "learning_rate": 1.3300668440521773e-05, "loss": 0.4694, "step": 22720 }, { "epoch": 2.20551135261013, "grad_norm": 2.3546727224779715, "learning_rate": 1.3284444156012721e-05, "loss": 0.4355, "step": 22730 }, { "epoch": 2.2064816611682514, "grad_norm": 1.832975462434467, "learning_rate": 1.3268219871503668e-05, "loss": 0.4984, "step": 22740 }, { "epoch": 2.207451969726373, "grad_norm": 1.7975073035604554, "learning_rate": 1.3251995586994614e-05, "loss": 0.4584, "step": 22750 }, { "epoch": 2.2084222782844947, "grad_norm": 2.243306278714619, "learning_rate": 1.323577130248556e-05, "loss": 0.4365, "step": 22760 }, { "epoch": 2.209392586842616, "grad_norm": 2.0297904158946043, "learning_rate": 1.3219547017976509e-05, "loss": 0.5148, "step": 22770 }, { "epoch": 2.2103628954007375, "grad_norm": 1.6322562865594754, "learning_rate": 1.3203322733467455e-05, "loss": 0.4782, "step": 22780 }, { "epoch": 2.211333203958859, "grad_norm": 2.277603717413092, "learning_rate": 1.3187098448958401e-05, "loss": 0.4899, "step": 22790 }, { "epoch": 2.2123035125169803, "grad_norm": 1.2452738117030233, "learning_rate": 1.3170874164449348e-05, "loss": 0.4459, "step": 22800 }, { "epoch": 2.2132738210751017, "grad_norm": 2.3624119652920177, "learning_rate": 1.3154649879940294e-05, "loss": 0.5033, "step": 22810 }, { "epoch": 2.214244129633223, "grad_norm": 1.6861367814447958, "learning_rate": 1.3138425595431242e-05, "loss": 0.4434, "step": 22820 }, { "epoch": 2.215214438191345, "grad_norm": 2.5489904753238166, "learning_rate": 1.3122201310922189e-05, "loss": 0.4465, "step": 22830 }, { "epoch": 2.2161847467494664, "grad_norm": 1.5652883952842678, "learning_rate": 1.3105977026413135e-05, "loss": 0.4648, "step": 22840 }, { "epoch": 2.217155055307588, "grad_norm": 2.106609892171019, "learning_rate": 1.3089752741904081e-05, "loss": 0.4809, "step": 22850 }, { "epoch": 2.2181253638657092, "grad_norm": 2.7417498454164653, "learning_rate": 1.307352845739503e-05, "loss": 0.4652, "step": 22860 }, { "epoch": 2.2190956724238307, "grad_norm": 2.0375355388485645, "learning_rate": 1.3057304172885976e-05, "loss": 0.499, "step": 22870 }, { "epoch": 2.220065980981952, "grad_norm": 1.6341471214683112, "learning_rate": 1.3041079888376922e-05, "loss": 0.4935, "step": 22880 }, { "epoch": 2.221036289540074, "grad_norm": 2.3570884682492346, "learning_rate": 1.3024855603867869e-05, "loss": 0.4884, "step": 22890 }, { "epoch": 2.2220065980981953, "grad_norm": 1.9493074304368556, "learning_rate": 1.3008631319358817e-05, "loss": 0.4838, "step": 22900 }, { "epoch": 2.2229769066563168, "grad_norm": 2.1315651431097224, "learning_rate": 1.2992407034849763e-05, "loss": 0.4978, "step": 22910 }, { "epoch": 2.223947215214438, "grad_norm": 1.8843986392844572, "learning_rate": 1.297618275034071e-05, "loss": 0.4922, "step": 22920 }, { "epoch": 2.2249175237725596, "grad_norm": 2.0055582988037313, "learning_rate": 1.2959958465831656e-05, "loss": 0.4702, "step": 22930 }, { "epoch": 2.225887832330681, "grad_norm": 1.8513569200366902, "learning_rate": 1.2943734181322604e-05, "loss": 0.467, "step": 22940 }, { "epoch": 2.226858140888803, "grad_norm": 2.3018937212184682, "learning_rate": 1.292750989681355e-05, "loss": 0.4632, "step": 22950 }, { "epoch": 2.2278284494469243, "grad_norm": 2.1028486436310905, "learning_rate": 1.2911285612304497e-05, "loss": 0.4329, "step": 22960 }, { "epoch": 2.2287987580050457, "grad_norm": 2.3891744137342994, "learning_rate": 1.2895061327795443e-05, "loss": 0.4549, "step": 22970 }, { "epoch": 2.229769066563167, "grad_norm": 1.8338746725123478, "learning_rate": 1.2878837043286393e-05, "loss": 0.4518, "step": 22980 }, { "epoch": 2.2307393751212885, "grad_norm": 2.5499388804291483, "learning_rate": 1.286261275877734e-05, "loss": 0.4852, "step": 22990 }, { "epoch": 2.23170968367941, "grad_norm": 1.6648341489250056, "learning_rate": 1.2846388474268287e-05, "loss": 0.4266, "step": 23000 }, { "epoch": 2.23170968367941, "eval_loss": 0.6283465623855591, "eval_runtime": 3074.8923, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 23000 }, { "epoch": 2.2326799922375313, "grad_norm": 2.0022361963538584, "learning_rate": 1.2830164189759234e-05, "loss": 0.5047, "step": 23010 }, { "epoch": 2.233650300795653, "grad_norm": 1.5655335338455796, "learning_rate": 1.281393990525018e-05, "loss": 0.4699, "step": 23020 }, { "epoch": 2.2346206093537746, "grad_norm": 2.1600874630795452, "learning_rate": 1.2797715620741127e-05, "loss": 0.4684, "step": 23030 }, { "epoch": 2.235590917911896, "grad_norm": 1.6958961004239297, "learning_rate": 1.2781491336232073e-05, "loss": 0.4769, "step": 23040 }, { "epoch": 2.2365612264700174, "grad_norm": 2.290659253729356, "learning_rate": 1.2765267051723021e-05, "loss": 0.5084, "step": 23050 }, { "epoch": 2.237531535028139, "grad_norm": 2.250294105691862, "learning_rate": 1.2749042767213967e-05, "loss": 0.4562, "step": 23060 }, { "epoch": 2.2385018435862603, "grad_norm": 1.93879453393384, "learning_rate": 1.2732818482704914e-05, "loss": 0.4687, "step": 23070 }, { "epoch": 2.239472152144382, "grad_norm": 2.3403124810757805, "learning_rate": 1.271659419819586e-05, "loss": 0.493, "step": 23080 }, { "epoch": 2.2404424607025035, "grad_norm": 1.800829852558906, "learning_rate": 1.2700369913686808e-05, "loss": 0.4786, "step": 23090 }, { "epoch": 2.241412769260625, "grad_norm": 2.0849042725619165, "learning_rate": 1.2684145629177755e-05, "loss": 0.4682, "step": 23100 }, { "epoch": 2.2423830778187464, "grad_norm": 2.209642518744148, "learning_rate": 1.2667921344668701e-05, "loss": 0.4705, "step": 23110 }, { "epoch": 2.2433533863768678, "grad_norm": 2.04809833807039, "learning_rate": 1.2651697060159647e-05, "loss": 0.4652, "step": 23120 }, { "epoch": 2.244323694934989, "grad_norm": 2.134292380445789, "learning_rate": 1.2635472775650595e-05, "loss": 0.4375, "step": 23130 }, { "epoch": 2.245294003493111, "grad_norm": 2.0308679578505466, "learning_rate": 1.2619248491141542e-05, "loss": 0.4256, "step": 23140 }, { "epoch": 2.2462643120512324, "grad_norm": 1.7740882276229983, "learning_rate": 1.2603024206632488e-05, "loss": 0.4754, "step": 23150 }, { "epoch": 2.247234620609354, "grad_norm": 2.845899626015525, "learning_rate": 1.2586799922123435e-05, "loss": 0.4465, "step": 23160 }, { "epoch": 2.2482049291674753, "grad_norm": 1.6656047995660936, "learning_rate": 1.2570575637614383e-05, "loss": 0.4645, "step": 23170 }, { "epoch": 2.2491752377255967, "grad_norm": 2.1317439051006675, "learning_rate": 1.2554351353105329e-05, "loss": 0.4215, "step": 23180 }, { "epoch": 2.250145546283718, "grad_norm": 1.8180573164715403, "learning_rate": 1.2538127068596275e-05, "loss": 0.448, "step": 23190 }, { "epoch": 2.2511158548418395, "grad_norm": 1.7997832378510108, "learning_rate": 1.2521902784087222e-05, "loss": 0.5032, "step": 23200 }, { "epoch": 2.2520861633999614, "grad_norm": 1.97843541131739, "learning_rate": 1.2505678499578168e-05, "loss": 0.4577, "step": 23210 }, { "epoch": 2.253056471958083, "grad_norm": 2.1980311567337045, "learning_rate": 1.2489454215069116e-05, "loss": 0.4492, "step": 23220 }, { "epoch": 2.254026780516204, "grad_norm": 1.888365535194607, "learning_rate": 1.2473229930560063e-05, "loss": 0.4416, "step": 23230 }, { "epoch": 2.2549970890743256, "grad_norm": 1.7544850776435907, "learning_rate": 1.2457005646051009e-05, "loss": 0.4803, "step": 23240 }, { "epoch": 2.255967397632447, "grad_norm": 1.8829778599554805, "learning_rate": 1.2440781361541955e-05, "loss": 0.4227, "step": 23250 }, { "epoch": 2.2569377061905684, "grad_norm": 2.1628337950810956, "learning_rate": 1.2424557077032903e-05, "loss": 0.4467, "step": 23260 }, { "epoch": 2.25790801474869, "grad_norm": 2.1336139136695254, "learning_rate": 1.240833279252385e-05, "loss": 0.4798, "step": 23270 }, { "epoch": 2.2588783233068117, "grad_norm": 2.3462272747330277, "learning_rate": 1.2392108508014798e-05, "loss": 0.4894, "step": 23280 }, { "epoch": 2.259848631864933, "grad_norm": 1.9473113253648615, "learning_rate": 1.2375884223505744e-05, "loss": 0.4353, "step": 23290 }, { "epoch": 2.2608189404230545, "grad_norm": 2.357093728494574, "learning_rate": 1.235965993899669e-05, "loss": 0.4508, "step": 23300 }, { "epoch": 2.261789248981176, "grad_norm": 2.0651529569693468, "learning_rate": 1.2343435654487639e-05, "loss": 0.5106, "step": 23310 }, { "epoch": 2.2627595575392974, "grad_norm": 1.8536850282087043, "learning_rate": 1.2327211369978585e-05, "loss": 0.446, "step": 23320 }, { "epoch": 2.263729866097419, "grad_norm": 2.129566666742622, "learning_rate": 1.2310987085469532e-05, "loss": 0.4467, "step": 23330 }, { "epoch": 2.2647001746555406, "grad_norm": 1.73183433586886, "learning_rate": 1.2294762800960478e-05, "loss": 0.4712, "step": 23340 }, { "epoch": 2.265670483213662, "grad_norm": 1.5251850887891814, "learning_rate": 1.2278538516451426e-05, "loss": 0.5013, "step": 23350 }, { "epoch": 2.2666407917717835, "grad_norm": 2.184835227449666, "learning_rate": 1.2262314231942372e-05, "loss": 0.4714, "step": 23360 }, { "epoch": 2.267611100329905, "grad_norm": 2.278133559610058, "learning_rate": 1.2246089947433319e-05, "loss": 0.4707, "step": 23370 }, { "epoch": 2.2685814088880263, "grad_norm": 2.043400099097976, "learning_rate": 1.2229865662924265e-05, "loss": 0.4398, "step": 23380 }, { "epoch": 2.2695517174461477, "grad_norm": 2.1473483931604145, "learning_rate": 1.2213641378415212e-05, "loss": 0.4229, "step": 23390 }, { "epoch": 2.2705220260042696, "grad_norm": 1.9230359161534956, "learning_rate": 1.219741709390616e-05, "loss": 0.425, "step": 23400 }, { "epoch": 2.271492334562391, "grad_norm": 2.0484743523955773, "learning_rate": 1.2181192809397106e-05, "loss": 0.4034, "step": 23410 }, { "epoch": 2.2724626431205124, "grad_norm": 2.018457750509176, "learning_rate": 1.2164968524888052e-05, "loss": 0.5061, "step": 23420 }, { "epoch": 2.273432951678634, "grad_norm": 2.0919643594364494, "learning_rate": 1.2148744240378999e-05, "loss": 0.432, "step": 23430 }, { "epoch": 2.274403260236755, "grad_norm": 2.4420554524762395, "learning_rate": 1.2132519955869947e-05, "loss": 0.4734, "step": 23440 }, { "epoch": 2.2753735687948766, "grad_norm": 2.2048878285052034, "learning_rate": 1.2116295671360893e-05, "loss": 0.5, "step": 23450 }, { "epoch": 2.276343877352998, "grad_norm": 2.385315906124704, "learning_rate": 1.210007138685184e-05, "loss": 0.3946, "step": 23460 }, { "epoch": 2.27731418591112, "grad_norm": 2.1039427863509936, "learning_rate": 1.2083847102342788e-05, "loss": 0.4509, "step": 23470 }, { "epoch": 2.2782844944692413, "grad_norm": 2.1731393161499977, "learning_rate": 1.2067622817833734e-05, "loss": 0.5216, "step": 23480 }, { "epoch": 2.2792548030273627, "grad_norm": 2.133407830915671, "learning_rate": 1.2051398533324682e-05, "loss": 0.4652, "step": 23490 }, { "epoch": 2.280225111585484, "grad_norm": 1.8143906080604477, "learning_rate": 1.2035174248815629e-05, "loss": 0.4608, "step": 23500 }, { "epoch": 2.280225111585484, "eval_loss": 0.6306902766227722, "eval_runtime": 3119.399, "eval_samples_per_second": 0.574, "eval_steps_per_second": 0.287, "step": 23500 }, { "epoch": 2.2811954201436055, "grad_norm": 2.2848754365231545, "learning_rate": 1.2018949964306575e-05, "loss": 0.5139, "step": 23510 }, { "epoch": 2.282165728701727, "grad_norm": 2.354718632997715, "learning_rate": 1.2002725679797521e-05, "loss": 0.4255, "step": 23520 }, { "epoch": 2.283136037259849, "grad_norm": 2.256014847978423, "learning_rate": 1.198650139528847e-05, "loss": 0.4393, "step": 23530 }, { "epoch": 2.2841063458179702, "grad_norm": 2.016243274514756, "learning_rate": 1.1970277110779416e-05, "loss": 0.4194, "step": 23540 }, { "epoch": 2.2850766543760916, "grad_norm": 2.4759209758692426, "learning_rate": 1.1954052826270362e-05, "loss": 0.4757, "step": 23550 }, { "epoch": 2.286046962934213, "grad_norm": 2.2547867685912486, "learning_rate": 1.1937828541761309e-05, "loss": 0.4511, "step": 23560 }, { "epoch": 2.2870172714923345, "grad_norm": 1.9593024304910214, "learning_rate": 1.1921604257252255e-05, "loss": 0.5043, "step": 23570 }, { "epoch": 2.287987580050456, "grad_norm": 2.2107379578337567, "learning_rate": 1.1905379972743203e-05, "loss": 0.4349, "step": 23580 }, { "epoch": 2.2889578886085777, "grad_norm": 1.8779808167419767, "learning_rate": 1.188915568823415e-05, "loss": 0.4063, "step": 23590 }, { "epoch": 2.289928197166699, "grad_norm": 2.1111765494708052, "learning_rate": 1.1872931403725096e-05, "loss": 0.4327, "step": 23600 }, { "epoch": 2.2908985057248206, "grad_norm": 1.906738344577065, "learning_rate": 1.1856707119216042e-05, "loss": 0.4676, "step": 23610 }, { "epoch": 2.291868814282942, "grad_norm": 2.6990569903001167, "learning_rate": 1.184048283470699e-05, "loss": 0.4759, "step": 23620 }, { "epoch": 2.2928391228410634, "grad_norm": 2.39173938568666, "learning_rate": 1.1824258550197937e-05, "loss": 0.4565, "step": 23630 }, { "epoch": 2.293809431399185, "grad_norm": 2.264327763939328, "learning_rate": 1.1808034265688883e-05, "loss": 0.5, "step": 23640 }, { "epoch": 2.294779739957306, "grad_norm": 2.5489345132458494, "learning_rate": 1.179180998117983e-05, "loss": 0.4545, "step": 23650 }, { "epoch": 2.295750048515428, "grad_norm": 1.5435526436096745, "learning_rate": 1.1775585696670777e-05, "loss": 0.444, "step": 23660 }, { "epoch": 2.2967203570735495, "grad_norm": 2.096262775761873, "learning_rate": 1.1759361412161725e-05, "loss": 0.4651, "step": 23670 }, { "epoch": 2.297690665631671, "grad_norm": 2.190613074347611, "learning_rate": 1.1743137127652672e-05, "loss": 0.5474, "step": 23680 }, { "epoch": 2.2986609741897923, "grad_norm": 2.077801881915876, "learning_rate": 1.1726912843143618e-05, "loss": 0.4762, "step": 23690 }, { "epoch": 2.2996312827479137, "grad_norm": 2.01599660916335, "learning_rate": 1.1710688558634565e-05, "loss": 0.4819, "step": 23700 }, { "epoch": 2.300601591306035, "grad_norm": 1.7711529107558897, "learning_rate": 1.1694464274125513e-05, "loss": 0.5082, "step": 23710 }, { "epoch": 2.3015718998641566, "grad_norm": 2.015443958364855, "learning_rate": 1.1678239989616459e-05, "loss": 0.4578, "step": 23720 }, { "epoch": 2.3025422084222784, "grad_norm": 1.6478971592533067, "learning_rate": 1.1662015705107405e-05, "loss": 0.4285, "step": 23730 }, { "epoch": 2.3035125169804, "grad_norm": 1.722295133645004, "learning_rate": 1.1645791420598352e-05, "loss": 0.4678, "step": 23740 }, { "epoch": 2.3044828255385212, "grad_norm": 2.004493677850927, "learning_rate": 1.16295671360893e-05, "loss": 0.412, "step": 23750 }, { "epoch": 2.3054531340966427, "grad_norm": 2.45778069486736, "learning_rate": 1.1613342851580246e-05, "loss": 0.5124, "step": 23760 }, { "epoch": 2.306423442654764, "grad_norm": 2.040288471406675, "learning_rate": 1.1597118567071193e-05, "loss": 0.448, "step": 23770 }, { "epoch": 2.307393751212886, "grad_norm": 1.995303973637335, "learning_rate": 1.1580894282562139e-05, "loss": 0.5022, "step": 23780 }, { "epoch": 2.3083640597710073, "grad_norm": 2.080262776846013, "learning_rate": 1.1564669998053085e-05, "loss": 0.4962, "step": 23790 }, { "epoch": 2.3093343683291288, "grad_norm": 2.2695683008172756, "learning_rate": 1.1548445713544034e-05, "loss": 0.5149, "step": 23800 }, { "epoch": 2.31030467688725, "grad_norm": 2.2271193293614724, "learning_rate": 1.153222142903498e-05, "loss": 0.4527, "step": 23810 }, { "epoch": 2.3112749854453716, "grad_norm": 1.8037542108398639, "learning_rate": 1.1515997144525926e-05, "loss": 0.4569, "step": 23820 }, { "epoch": 2.312245294003493, "grad_norm": 1.8828460178890125, "learning_rate": 1.1499772860016873e-05, "loss": 0.446, "step": 23830 }, { "epoch": 2.3132156025616144, "grad_norm": 2.148750511684889, "learning_rate": 1.148354857550782e-05, "loss": 0.4258, "step": 23840 }, { "epoch": 2.3141859111197363, "grad_norm": 1.9299282888568785, "learning_rate": 1.1467324290998767e-05, "loss": 0.443, "step": 23850 }, { "epoch": 2.3151562196778577, "grad_norm": 2.306570936387397, "learning_rate": 1.1451100006489715e-05, "loss": 0.4742, "step": 23860 }, { "epoch": 2.316126528235979, "grad_norm": 1.652399835965133, "learning_rate": 1.1434875721980662e-05, "loss": 0.4434, "step": 23870 }, { "epoch": 2.3170968367941005, "grad_norm": 1.8666656145624976, "learning_rate": 1.1418651437471608e-05, "loss": 0.4165, "step": 23880 }, { "epoch": 2.318067145352222, "grad_norm": 1.799707347697279, "learning_rate": 1.1402427152962556e-05, "loss": 0.4525, "step": 23890 }, { "epoch": 2.3190374539103433, "grad_norm": 1.8900931059214818, "learning_rate": 1.1386202868453502e-05, "loss": 0.4675, "step": 23900 }, { "epoch": 2.3200077624684647, "grad_norm": 2.2479167053649896, "learning_rate": 1.1369978583944449e-05, "loss": 0.4624, "step": 23910 }, { "epoch": 2.3209780710265866, "grad_norm": 2.2784052431273616, "learning_rate": 1.1353754299435395e-05, "loss": 0.4219, "step": 23920 }, { "epoch": 2.321948379584708, "grad_norm": 2.1485033156725417, "learning_rate": 1.1337530014926343e-05, "loss": 0.4118, "step": 23930 }, { "epoch": 2.3229186881428294, "grad_norm": 2.0610231375794075, "learning_rate": 1.132130573041729e-05, "loss": 0.4672, "step": 23940 }, { "epoch": 2.323888996700951, "grad_norm": 1.8373253686424216, "learning_rate": 1.1305081445908236e-05, "loss": 0.4892, "step": 23950 }, { "epoch": 2.3248593052590723, "grad_norm": 2.0784968854146375, "learning_rate": 1.1288857161399182e-05, "loss": 0.426, "step": 23960 }, { "epoch": 2.325829613817194, "grad_norm": 1.8246399051495223, "learning_rate": 1.1272632876890129e-05, "loss": 0.4707, "step": 23970 }, { "epoch": 2.3267999223753155, "grad_norm": 1.8607594933847929, "learning_rate": 1.1256408592381077e-05, "loss": 0.4787, "step": 23980 }, { "epoch": 2.327770230933437, "grad_norm": 2.1338034995885673, "learning_rate": 1.1240184307872023e-05, "loss": 0.4188, "step": 23990 }, { "epoch": 2.3287405394915583, "grad_norm": 1.691316437747956, "learning_rate": 1.122396002336297e-05, "loss": 0.4683, "step": 24000 }, { "epoch": 2.3287405394915583, "eval_loss": 0.6269034147262573, "eval_runtime": 3077.756, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 24000 }, { "epoch": 2.3297108480496798, "grad_norm": 2.040492940843433, "learning_rate": 1.1207735738853916e-05, "loss": 0.4569, "step": 24010 }, { "epoch": 2.330681156607801, "grad_norm": 1.7971740640523082, "learning_rate": 1.1191511454344864e-05, "loss": 0.4495, "step": 24020 }, { "epoch": 2.3316514651659226, "grad_norm": 1.939848570121438, "learning_rate": 1.117528716983581e-05, "loss": 0.4794, "step": 24030 }, { "epoch": 2.3326217737240444, "grad_norm": 2.032618108671819, "learning_rate": 1.1159062885326757e-05, "loss": 0.4446, "step": 24040 }, { "epoch": 2.333592082282166, "grad_norm": 1.9156260447966438, "learning_rate": 1.1142838600817703e-05, "loss": 0.4687, "step": 24050 }, { "epoch": 2.3345623908402873, "grad_norm": 1.9351404363288518, "learning_rate": 1.1126614316308651e-05, "loss": 0.5079, "step": 24060 }, { "epoch": 2.3355326993984087, "grad_norm": 1.9150652539150566, "learning_rate": 1.11103900317996e-05, "loss": 0.4673, "step": 24070 }, { "epoch": 2.33650300795653, "grad_norm": 1.8679864315268337, "learning_rate": 1.1094165747290546e-05, "loss": 0.4291, "step": 24080 }, { "epoch": 2.3374733165146515, "grad_norm": 2.414373392352731, "learning_rate": 1.1077941462781492e-05, "loss": 0.4738, "step": 24090 }, { "epoch": 2.338443625072773, "grad_norm": 2.5480843998856466, "learning_rate": 1.1061717178272439e-05, "loss": 0.4281, "step": 24100 }, { "epoch": 2.339413933630895, "grad_norm": 1.7367372410706, "learning_rate": 1.1045492893763387e-05, "loss": 0.483, "step": 24110 }, { "epoch": 2.340384242189016, "grad_norm": 1.8020643362909123, "learning_rate": 1.1029268609254333e-05, "loss": 0.4534, "step": 24120 }, { "epoch": 2.3413545507471376, "grad_norm": 2.0532213004634117, "learning_rate": 1.101304432474528e-05, "loss": 0.4691, "step": 24130 }, { "epoch": 2.342324859305259, "grad_norm": 1.825257767760241, "learning_rate": 1.0996820040236226e-05, "loss": 0.488, "step": 24140 }, { "epoch": 2.3432951678633804, "grad_norm": 2.0243831093236513, "learning_rate": 1.0980595755727172e-05, "loss": 0.4846, "step": 24150 }, { "epoch": 2.344265476421502, "grad_norm": 2.249331730728062, "learning_rate": 1.096437147121812e-05, "loss": 0.4578, "step": 24160 }, { "epoch": 2.3452357849796237, "grad_norm": 2.147870816517021, "learning_rate": 1.0948147186709067e-05, "loss": 0.4453, "step": 24170 }, { "epoch": 2.346206093537745, "grad_norm": 1.8861077026920583, "learning_rate": 1.0931922902200013e-05, "loss": 0.4214, "step": 24180 }, { "epoch": 2.3471764020958665, "grad_norm": 1.9335635244208784, "learning_rate": 1.091569861769096e-05, "loss": 0.4053, "step": 24190 }, { "epoch": 2.348146710653988, "grad_norm": 2.1792571618590117, "learning_rate": 1.0899474333181907e-05, "loss": 0.4502, "step": 24200 }, { "epoch": 2.3491170192121094, "grad_norm": 1.8035101955175616, "learning_rate": 1.0883250048672854e-05, "loss": 0.4777, "step": 24210 }, { "epoch": 2.3500873277702308, "grad_norm": 2.3748193703673506, "learning_rate": 1.08670257641638e-05, "loss": 0.4348, "step": 24220 }, { "epoch": 2.3510576363283526, "grad_norm": 2.093876681748244, "learning_rate": 1.0850801479654747e-05, "loss": 0.4801, "step": 24230 }, { "epoch": 2.352027944886474, "grad_norm": 2.0917507142883025, "learning_rate": 1.0834577195145695e-05, "loss": 0.4622, "step": 24240 }, { "epoch": 2.3529982534445955, "grad_norm": 2.4449607859132816, "learning_rate": 1.0818352910636643e-05, "loss": 0.462, "step": 24250 }, { "epoch": 2.353968562002717, "grad_norm": 2.1889912797955877, "learning_rate": 1.080212862612759e-05, "loss": 0.4614, "step": 24260 }, { "epoch": 2.3549388705608383, "grad_norm": 1.7339445133001758, "learning_rate": 1.0785904341618536e-05, "loss": 0.4477, "step": 24270 }, { "epoch": 2.3559091791189597, "grad_norm": 2.2974387900444064, "learning_rate": 1.0769680057109482e-05, "loss": 0.488, "step": 24280 }, { "epoch": 2.356879487677081, "grad_norm": 1.9216704451577764, "learning_rate": 1.075345577260043e-05, "loss": 0.4602, "step": 24290 }, { "epoch": 2.357849796235203, "grad_norm": 1.595623484078005, "learning_rate": 1.0737231488091376e-05, "loss": 0.4618, "step": 24300 }, { "epoch": 2.3588201047933244, "grad_norm": 2.1389381123821973, "learning_rate": 1.0721007203582323e-05, "loss": 0.4109, "step": 24310 }, { "epoch": 2.359790413351446, "grad_norm": 2.1703895980886294, "learning_rate": 1.070478291907327e-05, "loss": 0.4755, "step": 24320 }, { "epoch": 2.360760721909567, "grad_norm": 2.101496536635865, "learning_rate": 1.0688558634564216e-05, "loss": 0.4371, "step": 24330 }, { "epoch": 2.3617310304676886, "grad_norm": 2.201243439271482, "learning_rate": 1.0672334350055164e-05, "loss": 0.4252, "step": 24340 }, { "epoch": 2.36270133902581, "grad_norm": 2.067190247810472, "learning_rate": 1.065611006554611e-05, "loss": 0.4684, "step": 24350 }, { "epoch": 2.3636716475839314, "grad_norm": 1.593808072912525, "learning_rate": 1.0639885781037056e-05, "loss": 0.4474, "step": 24360 }, { "epoch": 2.3646419561420533, "grad_norm": 2.3672169952188344, "learning_rate": 1.0623661496528003e-05, "loss": 0.4543, "step": 24370 }, { "epoch": 2.3656122647001747, "grad_norm": 2.173568364373994, "learning_rate": 1.060743721201895e-05, "loss": 0.4172, "step": 24380 }, { "epoch": 2.366582573258296, "grad_norm": 2.0059231401496005, "learning_rate": 1.0591212927509897e-05, "loss": 0.4735, "step": 24390 }, { "epoch": 2.3675528818164175, "grad_norm": 2.0851213756560187, "learning_rate": 1.0574988643000844e-05, "loss": 0.4363, "step": 24400 }, { "epoch": 2.368523190374539, "grad_norm": 2.05634160028851, "learning_rate": 1.055876435849179e-05, "loss": 0.5026, "step": 24410 }, { "epoch": 2.369493498932661, "grad_norm": 2.2318992281649583, "learning_rate": 1.0542540073982738e-05, "loss": 0.4272, "step": 24420 }, { "epoch": 2.3704638074907822, "grad_norm": 2.3837912348792862, "learning_rate": 1.0526315789473684e-05, "loss": 0.4873, "step": 24430 }, { "epoch": 2.3714341160489036, "grad_norm": 1.9839906718631033, "learning_rate": 1.051009150496463e-05, "loss": 0.4689, "step": 24440 }, { "epoch": 2.372404424607025, "grad_norm": 2.9092572102641374, "learning_rate": 1.0493867220455579e-05, "loss": 0.455, "step": 24450 }, { "epoch": 2.3733747331651465, "grad_norm": 1.783405495522693, "learning_rate": 1.0477642935946525e-05, "loss": 0.47, "step": 24460 }, { "epoch": 2.374345041723268, "grad_norm": 2.4188709581674, "learning_rate": 1.0461418651437473e-05, "loss": 0.4488, "step": 24470 }, { "epoch": 2.3753153502813893, "grad_norm": 2.014782666224647, "learning_rate": 1.044519436692842e-05, "loss": 0.4533, "step": 24480 }, { "epoch": 2.376285658839511, "grad_norm": 2.174178898015034, "learning_rate": 1.0428970082419366e-05, "loss": 0.444, "step": 24490 }, { "epoch": 2.3772559673976326, "grad_norm": 2.485611502054494, "learning_rate": 1.0412745797910313e-05, "loss": 0.5027, "step": 24500 }, { "epoch": 2.3772559673976326, "eval_loss": 0.6276779174804688, "eval_runtime": 3079.5235, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 24500 }, { "epoch": 2.378226275955754, "grad_norm": 2.5871995720796184, "learning_rate": 1.0396521513401259e-05, "loss": 0.4382, "step": 24510 }, { "epoch": 2.3791965845138754, "grad_norm": 1.536190288984018, "learning_rate": 1.0380297228892207e-05, "loss": 0.4473, "step": 24520 }, { "epoch": 2.380166893071997, "grad_norm": 1.6459160347807742, "learning_rate": 1.0364072944383153e-05, "loss": 0.4078, "step": 24530 }, { "epoch": 2.381137201630118, "grad_norm": 2.152035121079775, "learning_rate": 1.03478486598741e-05, "loss": 0.4447, "step": 24540 }, { "epoch": 2.3821075101882396, "grad_norm": 2.369193332564519, "learning_rate": 1.0331624375365046e-05, "loss": 0.3994, "step": 24550 }, { "epoch": 2.3830778187463615, "grad_norm": 2.1514780345815256, "learning_rate": 1.0315400090855994e-05, "loss": 0.496, "step": 24560 }, { "epoch": 2.384048127304483, "grad_norm": 1.7911554347140786, "learning_rate": 1.029917580634694e-05, "loss": 0.4701, "step": 24570 }, { "epoch": 2.3850184358626043, "grad_norm": 2.0153664778745854, "learning_rate": 1.0282951521837887e-05, "loss": 0.4219, "step": 24580 }, { "epoch": 2.3859887444207257, "grad_norm": 1.80318954794296, "learning_rate": 1.0266727237328833e-05, "loss": 0.4683, "step": 24590 }, { "epoch": 2.386959052978847, "grad_norm": 2.588414199711988, "learning_rate": 1.0250502952819781e-05, "loss": 0.5015, "step": 24600 }, { "epoch": 2.387929361536969, "grad_norm": 2.4217752650788613, "learning_rate": 1.0234278668310728e-05, "loss": 0.4975, "step": 24610 }, { "epoch": 2.3888996700950904, "grad_norm": 1.794705542002023, "learning_rate": 1.0218054383801674e-05, "loss": 0.4836, "step": 24620 }, { "epoch": 2.389869978653212, "grad_norm": 1.7631883574466254, "learning_rate": 1.020183009929262e-05, "loss": 0.5004, "step": 24630 }, { "epoch": 2.3908402872113332, "grad_norm": 2.510901529741251, "learning_rate": 1.0185605814783569e-05, "loss": 0.4626, "step": 24640 }, { "epoch": 2.3918105957694547, "grad_norm": 2.0262233819009157, "learning_rate": 1.0169381530274517e-05, "loss": 0.4289, "step": 24650 }, { "epoch": 2.392780904327576, "grad_norm": 2.039691163120118, "learning_rate": 1.0153157245765463e-05, "loss": 0.4712, "step": 24660 }, { "epoch": 2.3937512128856975, "grad_norm": 2.3530360716419665, "learning_rate": 1.013693296125641e-05, "loss": 0.4661, "step": 24670 }, { "epoch": 2.3947215214438193, "grad_norm": 1.8731826227321207, "learning_rate": 1.0120708676747356e-05, "loss": 0.4602, "step": 24680 }, { "epoch": 2.3956918300019407, "grad_norm": 2.1068678736668067, "learning_rate": 1.0104484392238304e-05, "loss": 0.3962, "step": 24690 }, { "epoch": 2.396662138560062, "grad_norm": 2.0442268653871443, "learning_rate": 1.008826010772925e-05, "loss": 0.4241, "step": 24700 }, { "epoch": 2.3976324471181836, "grad_norm": 1.823987854926844, "learning_rate": 1.0072035823220197e-05, "loss": 0.4332, "step": 24710 }, { "epoch": 2.398602755676305, "grad_norm": 2.0466476195429433, "learning_rate": 1.0055811538711143e-05, "loss": 0.4973, "step": 24720 }, { "epoch": 2.3995730642344264, "grad_norm": 2.2091928863349706, "learning_rate": 1.003958725420209e-05, "loss": 0.4561, "step": 24730 }, { "epoch": 2.400543372792548, "grad_norm": 2.0728809538988417, "learning_rate": 1.0023362969693038e-05, "loss": 0.4311, "step": 24740 }, { "epoch": 2.4015136813506697, "grad_norm": 2.5482352204418612, "learning_rate": 1.0007138685183984e-05, "loss": 0.5071, "step": 24750 }, { "epoch": 2.402483989908791, "grad_norm": 1.9610940178733647, "learning_rate": 9.99091440067493e-06, "loss": 0.4401, "step": 24760 }, { "epoch": 2.4034542984669125, "grad_norm": 2.3445676431912448, "learning_rate": 9.974690116165877e-06, "loss": 0.4482, "step": 24770 }, { "epoch": 2.404424607025034, "grad_norm": 1.6828772581664517, "learning_rate": 9.958465831656825e-06, "loss": 0.4489, "step": 24780 }, { "epoch": 2.4053949155831553, "grad_norm": 2.126865830634638, "learning_rate": 9.942241547147771e-06, "loss": 0.4136, "step": 24790 }, { "epoch": 2.4063652241412767, "grad_norm": 1.6333253702974164, "learning_rate": 9.926017262638718e-06, "loss": 0.4479, "step": 24800 }, { "epoch": 2.4073355326993986, "grad_norm": 2.05443116143666, "learning_rate": 9.909792978129664e-06, "loss": 0.4806, "step": 24810 }, { "epoch": 2.40830584125752, "grad_norm": 2.1173326880866234, "learning_rate": 9.893568693620612e-06, "loss": 0.496, "step": 24820 }, { "epoch": 2.4092761498156414, "grad_norm": 2.6271672766867185, "learning_rate": 9.877344409111558e-06, "loss": 0.4953, "step": 24830 }, { "epoch": 2.410246458373763, "grad_norm": 1.7740927010946907, "learning_rate": 9.861120124602506e-06, "loss": 0.445, "step": 24840 }, { "epoch": 2.4112167669318842, "grad_norm": 1.7938164516772293, "learning_rate": 9.844895840093453e-06, "loss": 0.4651, "step": 24850 }, { "epoch": 2.4121870754900057, "grad_norm": 1.876181751989744, "learning_rate": 9.8286715555844e-06, "loss": 0.4742, "step": 24860 }, { "epoch": 2.4131573840481275, "grad_norm": 2.268711698650067, "learning_rate": 9.812447271075347e-06, "loss": 0.4858, "step": 24870 }, { "epoch": 2.414127692606249, "grad_norm": 2.002023556609326, "learning_rate": 9.796222986566294e-06, "loss": 0.437, "step": 24880 }, { "epoch": 2.4150980011643703, "grad_norm": 2.2377584218251787, "learning_rate": 9.77999870205724e-06, "loss": 0.4631, "step": 24890 }, { "epoch": 2.4160683097224918, "grad_norm": 2.048732665820942, "learning_rate": 9.763774417548186e-06, "loss": 0.4828, "step": 24900 }, { "epoch": 2.417038618280613, "grad_norm": 1.6448348280754015, "learning_rate": 9.747550133039133e-06, "loss": 0.5178, "step": 24910 }, { "epoch": 2.4180089268387346, "grad_norm": 1.5590200283867917, "learning_rate": 9.731325848530081e-06, "loss": 0.4582, "step": 24920 }, { "epoch": 2.418979235396856, "grad_norm": 1.9618039375978873, "learning_rate": 9.715101564021027e-06, "loss": 0.4207, "step": 24930 }, { "epoch": 2.419949543954978, "grad_norm": 2.067363699561904, "learning_rate": 9.698877279511974e-06, "loss": 0.4825, "step": 24940 }, { "epoch": 2.4209198525130993, "grad_norm": 2.1824993479688635, "learning_rate": 9.68265299500292e-06, "loss": 0.4507, "step": 24950 }, { "epoch": 2.4218901610712207, "grad_norm": 1.9138947437636256, "learning_rate": 9.666428710493868e-06, "loss": 0.4576, "step": 24960 }, { "epoch": 2.422860469629342, "grad_norm": 1.6115918485232108, "learning_rate": 9.650204425984815e-06, "loss": 0.4705, "step": 24970 }, { "epoch": 2.4238307781874635, "grad_norm": 2.250793958728259, "learning_rate": 9.633980141475761e-06, "loss": 0.4521, "step": 24980 }, { "epoch": 2.424801086745585, "grad_norm": 1.8523875454672862, "learning_rate": 9.617755856966707e-06, "loss": 0.4397, "step": 24990 }, { "epoch": 2.4257713953037063, "grad_norm": 1.904730107613101, "learning_rate": 9.601531572457655e-06, "loss": 0.4564, "step": 25000 }, { "epoch": 2.4257713953037063, "eval_loss": 0.6268747448921204, "eval_runtime": 3077.7529, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 25000 }, { "epoch": 2.426741703861828, "grad_norm": 2.161806358765534, "learning_rate": 9.585307287948602e-06, "loss": 0.4854, "step": 25010 }, { "epoch": 2.4277120124199496, "grad_norm": 2.3367238235277483, "learning_rate": 9.569083003439548e-06, "loss": 0.4657, "step": 25020 }, { "epoch": 2.428682320978071, "grad_norm": 1.7620434830381735, "learning_rate": 9.552858718930495e-06, "loss": 0.4716, "step": 25030 }, { "epoch": 2.4296526295361924, "grad_norm": 1.9332408663776153, "learning_rate": 9.536634434421443e-06, "loss": 0.4321, "step": 25040 }, { "epoch": 2.430622938094314, "grad_norm": 2.0611278551192465, "learning_rate": 9.52041014991239e-06, "loss": 0.4677, "step": 25050 }, { "epoch": 2.4315932466524357, "grad_norm": 1.6768258233531348, "learning_rate": 9.504185865403337e-06, "loss": 0.462, "step": 25060 }, { "epoch": 2.432563555210557, "grad_norm": 2.3845648710174285, "learning_rate": 9.487961580894283e-06, "loss": 0.4557, "step": 25070 }, { "epoch": 2.4335338637686785, "grad_norm": 2.1104954575429047, "learning_rate": 9.47173729638523e-06, "loss": 0.4731, "step": 25080 }, { "epoch": 2.4345041723268, "grad_norm": 2.0800318180116446, "learning_rate": 9.455513011876176e-06, "loss": 0.5289, "step": 25090 }, { "epoch": 2.4354744808849214, "grad_norm": 2.237535485463924, "learning_rate": 9.439288727367124e-06, "loss": 0.456, "step": 25100 }, { "epoch": 2.4364447894430428, "grad_norm": 2.160116067888104, "learning_rate": 9.42306444285807e-06, "loss": 0.4984, "step": 25110 }, { "epoch": 2.437415098001164, "grad_norm": 2.0888945069063336, "learning_rate": 9.406840158349017e-06, "loss": 0.4893, "step": 25120 }, { "epoch": 2.438385406559286, "grad_norm": 1.997111795963389, "learning_rate": 9.390615873839963e-06, "loss": 0.4434, "step": 25130 }, { "epoch": 2.4393557151174075, "grad_norm": 1.3102614971883442, "learning_rate": 9.374391589330911e-06, "loss": 0.4015, "step": 25140 }, { "epoch": 2.440326023675529, "grad_norm": 1.972554181589825, "learning_rate": 9.358167304821858e-06, "loss": 0.4408, "step": 25150 }, { "epoch": 2.4412963322336503, "grad_norm": 1.6989988042772415, "learning_rate": 9.341943020312804e-06, "loss": 0.435, "step": 25160 }, { "epoch": 2.4422666407917717, "grad_norm": 2.124313923821145, "learning_rate": 9.32571873580375e-06, "loss": 0.4387, "step": 25170 }, { "epoch": 2.443236949349893, "grad_norm": 2.19131174810045, "learning_rate": 9.309494451294699e-06, "loss": 0.4672, "step": 25180 }, { "epoch": 2.4442072579080145, "grad_norm": 1.962932141350282, "learning_rate": 9.293270166785645e-06, "loss": 0.4983, "step": 25190 }, { "epoch": 2.4451775664661364, "grad_norm": 1.9122705512506952, "learning_rate": 9.277045882276591e-06, "loss": 0.4272, "step": 25200 }, { "epoch": 2.446147875024258, "grad_norm": 2.3053425047807736, "learning_rate": 9.260821597767538e-06, "loss": 0.4199, "step": 25210 }, { "epoch": 2.447118183582379, "grad_norm": 1.7446501480241867, "learning_rate": 9.244597313258486e-06, "loss": 0.4457, "step": 25220 }, { "epoch": 2.4480884921405006, "grad_norm": 2.1569409270025774, "learning_rate": 9.228373028749434e-06, "loss": 0.4419, "step": 25230 }, { "epoch": 2.449058800698622, "grad_norm": 2.0436766020436394, "learning_rate": 9.21214874424038e-06, "loss": 0.494, "step": 25240 }, { "epoch": 2.450029109256744, "grad_norm": 1.7972560660936232, "learning_rate": 9.195924459731327e-06, "loss": 0.45, "step": 25250 }, { "epoch": 2.4509994178148653, "grad_norm": 2.323226898620802, "learning_rate": 9.179700175222273e-06, "loss": 0.5016, "step": 25260 }, { "epoch": 2.4519697263729867, "grad_norm": 2.0725138170956408, "learning_rate": 9.16347589071322e-06, "loss": 0.4894, "step": 25270 }, { "epoch": 2.452940034931108, "grad_norm": 1.9222900586962415, "learning_rate": 9.147251606204168e-06, "loss": 0.5101, "step": 25280 }, { "epoch": 2.4539103434892295, "grad_norm": 2.0067752335911684, "learning_rate": 9.131027321695114e-06, "loss": 0.4418, "step": 25290 }, { "epoch": 2.454880652047351, "grad_norm": 2.1844091982849587, "learning_rate": 9.11480303718606e-06, "loss": 0.4812, "step": 25300 }, { "epoch": 2.4558509606054724, "grad_norm": 2.1484177813886487, "learning_rate": 9.098578752677007e-06, "loss": 0.4516, "step": 25310 }, { "epoch": 2.4568212691635942, "grad_norm": 2.070642163273252, "learning_rate": 9.082354468167955e-06, "loss": 0.402, "step": 25320 }, { "epoch": 2.4577915777217156, "grad_norm": 1.8576636206526207, "learning_rate": 9.066130183658901e-06, "loss": 0.4864, "step": 25330 }, { "epoch": 2.458761886279837, "grad_norm": 1.9269579474919305, "learning_rate": 9.049905899149848e-06, "loss": 0.4647, "step": 25340 }, { "epoch": 2.4597321948379585, "grad_norm": 1.79111642345789, "learning_rate": 9.033681614640794e-06, "loss": 0.4454, "step": 25350 }, { "epoch": 2.46070250339608, "grad_norm": 1.788304393616984, "learning_rate": 9.017457330131742e-06, "loss": 0.4422, "step": 25360 }, { "epoch": 2.4616728119542013, "grad_norm": 2.2543923009140734, "learning_rate": 9.001233045622688e-06, "loss": 0.4413, "step": 25370 }, { "epoch": 2.4626431205123227, "grad_norm": 1.1384131071115926, "learning_rate": 8.985008761113635e-06, "loss": 0.3985, "step": 25380 }, { "epoch": 2.4636134290704446, "grad_norm": 2.105028387028187, "learning_rate": 8.968784476604581e-06, "loss": 0.4467, "step": 25390 }, { "epoch": 2.464583737628566, "grad_norm": 1.925526835319373, "learning_rate": 8.95256019209553e-06, "loss": 0.4567, "step": 25400 }, { "epoch": 2.4655540461866874, "grad_norm": 2.229525401490896, "learning_rate": 8.936335907586476e-06, "loss": 0.4889, "step": 25410 }, { "epoch": 2.466524354744809, "grad_norm": 1.9389731054706656, "learning_rate": 8.920111623077422e-06, "loss": 0.3961, "step": 25420 }, { "epoch": 2.46749466330293, "grad_norm": 2.0732327430253985, "learning_rate": 8.90388733856837e-06, "loss": 0.4688, "step": 25430 }, { "epoch": 2.4684649718610516, "grad_norm": 2.0697499575240665, "learning_rate": 8.887663054059317e-06, "loss": 0.4383, "step": 25440 }, { "epoch": 2.4694352804191735, "grad_norm": 1.7921240641401464, "learning_rate": 8.871438769550265e-06, "loss": 0.4513, "step": 25450 }, { "epoch": 2.470405588977295, "grad_norm": 1.3473761062363179, "learning_rate": 8.855214485041211e-06, "loss": 0.4384, "step": 25460 }, { "epoch": 2.4713758975354163, "grad_norm": 1.7010157532863919, "learning_rate": 8.838990200532157e-06, "loss": 0.4791, "step": 25470 }, { "epoch": 2.4723462060935377, "grad_norm": 1.3896979150167772, "learning_rate": 8.822765916023104e-06, "loss": 0.4421, "step": 25480 }, { "epoch": 2.473316514651659, "grad_norm": 1.8009616913979432, "learning_rate": 8.80654163151405e-06, "loss": 0.4873, "step": 25490 }, { "epoch": 2.4742868232097806, "grad_norm": 2.2299861589049885, "learning_rate": 8.790317347004998e-06, "loss": 0.4032, "step": 25500 }, { "epoch": 2.4742868232097806, "eval_loss": 0.6272784471511841, "eval_runtime": 3078.0792, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 25500 }, { "epoch": 2.4752571317679024, "grad_norm": 1.8649860804881975, "learning_rate": 8.774093062495945e-06, "loss": 0.4212, "step": 25510 }, { "epoch": 2.476227440326024, "grad_norm": 2.193301904479279, "learning_rate": 8.757868777986891e-06, "loss": 0.4252, "step": 25520 }, { "epoch": 2.4771977488841452, "grad_norm": 2.336719368813014, "learning_rate": 8.741644493477837e-06, "loss": 0.4291, "step": 25530 }, { "epoch": 2.4781680574422666, "grad_norm": 2.17152920020732, "learning_rate": 8.725420208968785e-06, "loss": 0.4321, "step": 25540 }, { "epoch": 2.479138366000388, "grad_norm": 2.164652196668536, "learning_rate": 8.709195924459732e-06, "loss": 0.4638, "step": 25550 }, { "epoch": 2.4801086745585095, "grad_norm": 2.345312570223112, "learning_rate": 8.692971639950678e-06, "loss": 0.4248, "step": 25560 }, { "epoch": 2.481078983116631, "grad_norm": 2.1770260990943373, "learning_rate": 8.676747355441625e-06, "loss": 0.4596, "step": 25570 }, { "epoch": 2.4820492916747527, "grad_norm": 2.010469969137573, "learning_rate": 8.660523070932573e-06, "loss": 0.4179, "step": 25580 }, { "epoch": 2.483019600232874, "grad_norm": 2.460621539699081, "learning_rate": 8.644298786423519e-06, "loss": 0.4708, "step": 25590 }, { "epoch": 2.4839899087909956, "grad_norm": 1.9419411686401578, "learning_rate": 8.628074501914465e-06, "loss": 0.5145, "step": 25600 }, { "epoch": 2.484960217349117, "grad_norm": 1.8628106961622968, "learning_rate": 8.611850217405412e-06, "loss": 0.4775, "step": 25610 }, { "epoch": 2.4859305259072384, "grad_norm": 3.1904713012817307, "learning_rate": 8.59562593289636e-06, "loss": 0.4751, "step": 25620 }, { "epoch": 2.48690083446536, "grad_norm": 2.1228744229790797, "learning_rate": 8.579401648387308e-06, "loss": 0.4267, "step": 25630 }, { "epoch": 2.4878711430234812, "grad_norm": 2.111665695733381, "learning_rate": 8.563177363878254e-06, "loss": 0.4559, "step": 25640 }, { "epoch": 2.488841451581603, "grad_norm": 1.851520695502125, "learning_rate": 8.5469530793692e-06, "loss": 0.4314, "step": 25650 }, { "epoch": 2.4898117601397245, "grad_norm": 2.0235108403859456, "learning_rate": 8.530728794860147e-06, "loss": 0.4666, "step": 25660 }, { "epoch": 2.490782068697846, "grad_norm": 2.207796603692839, "learning_rate": 8.514504510351093e-06, "loss": 0.4109, "step": 25670 }, { "epoch": 2.4917523772559673, "grad_norm": 2.231394855430463, "learning_rate": 8.498280225842042e-06, "loss": 0.5076, "step": 25680 }, { "epoch": 2.4927226858140887, "grad_norm": 2.1687837431881203, "learning_rate": 8.482055941332988e-06, "loss": 0.4387, "step": 25690 }, { "epoch": 2.4936929943722106, "grad_norm": 2.44016617426192, "learning_rate": 8.465831656823934e-06, "loss": 0.4321, "step": 25700 }, { "epoch": 2.494663302930332, "grad_norm": 1.7058498203781634, "learning_rate": 8.44960737231488e-06, "loss": 0.4921, "step": 25710 }, { "epoch": 2.4956336114884534, "grad_norm": 1.9042135604418329, "learning_rate": 8.433383087805829e-06, "loss": 0.4041, "step": 25720 }, { "epoch": 2.496603920046575, "grad_norm": 1.8436463298626156, "learning_rate": 8.417158803296775e-06, "loss": 0.49, "step": 25730 }, { "epoch": 2.4975742286046962, "grad_norm": 2.2019048088471327, "learning_rate": 8.400934518787722e-06, "loss": 0.4691, "step": 25740 }, { "epoch": 2.4985445371628177, "grad_norm": 2.0782350378861665, "learning_rate": 8.384710234278668e-06, "loss": 0.4508, "step": 25750 }, { "epoch": 2.499514845720939, "grad_norm": 2.145442299189053, "learning_rate": 8.368485949769616e-06, "loss": 0.4743, "step": 25760 }, { "epoch": 2.500485154279061, "grad_norm": 2.316566397386294, "learning_rate": 8.352261665260562e-06, "loss": 0.4592, "step": 25770 }, { "epoch": 2.5014554628371823, "grad_norm": 1.6346051404759172, "learning_rate": 8.336037380751509e-06, "loss": 0.4035, "step": 25780 }, { "epoch": 2.5024257713953038, "grad_norm": 2.0841388033352883, "learning_rate": 8.319813096242455e-06, "loss": 0.4486, "step": 25790 }, { "epoch": 2.503396079953425, "grad_norm": 1.9573845628507625, "learning_rate": 8.303588811733402e-06, "loss": 0.582, "step": 25800 }, { "epoch": 2.5043663885115466, "grad_norm": 2.598464150785621, "learning_rate": 8.28736452722435e-06, "loss": 0.4414, "step": 25810 }, { "epoch": 2.505336697069668, "grad_norm": 2.3038977683814896, "learning_rate": 8.271140242715298e-06, "loss": 0.4723, "step": 25820 }, { "epoch": 2.5063070056277894, "grad_norm": 2.020217994153988, "learning_rate": 8.254915958206244e-06, "loss": 0.4526, "step": 25830 }, { "epoch": 2.5072773141859113, "grad_norm": 1.8235299819150055, "learning_rate": 8.23869167369719e-06, "loss": 0.459, "step": 25840 }, { "epoch": 2.5082476227440327, "grad_norm": 2.673806204400313, "learning_rate": 8.222467389188137e-06, "loss": 0.4492, "step": 25850 }, { "epoch": 2.509217931302154, "grad_norm": 2.0555568396434234, "learning_rate": 8.206243104679085e-06, "loss": 0.4484, "step": 25860 }, { "epoch": 2.5101882398602755, "grad_norm": 1.9255447316677266, "learning_rate": 8.190018820170031e-06, "loss": 0.4948, "step": 25870 }, { "epoch": 2.511158548418397, "grad_norm": 1.9605963182738635, "learning_rate": 8.173794535660978e-06, "loss": 0.4114, "step": 25880 }, { "epoch": 2.5121288569765188, "grad_norm": 2.01480662215233, "learning_rate": 8.157570251151924e-06, "loss": 0.4979, "step": 25890 }, { "epoch": 2.5130991655346397, "grad_norm": 2.76350538826215, "learning_rate": 8.141345966642872e-06, "loss": 0.5307, "step": 25900 }, { "epoch": 2.5140694740927616, "grad_norm": 2.1438183238992634, "learning_rate": 8.125121682133819e-06, "loss": 0.4645, "step": 25910 }, { "epoch": 2.515039782650883, "grad_norm": 2.618047396021202, "learning_rate": 8.108897397624765e-06, "loss": 0.4656, "step": 25920 }, { "epoch": 2.5160100912090044, "grad_norm": 1.768467222286068, "learning_rate": 8.092673113115711e-06, "loss": 0.4489, "step": 25930 }, { "epoch": 2.516980399767126, "grad_norm": 2.114713049707471, "learning_rate": 8.07644882860666e-06, "loss": 0.4505, "step": 25940 }, { "epoch": 2.5179507083252473, "grad_norm": 1.8424804225977651, "learning_rate": 8.060224544097606e-06, "loss": 0.4206, "step": 25950 }, { "epoch": 2.518921016883369, "grad_norm": 2.131011626241048, "learning_rate": 8.044000259588552e-06, "loss": 0.4826, "step": 25960 }, { "epoch": 2.5198913254414905, "grad_norm": 1.8234473031937264, "learning_rate": 8.027775975079499e-06, "loss": 0.4209, "step": 25970 }, { "epoch": 2.520861633999612, "grad_norm": 1.8249503573593846, "learning_rate": 8.011551690570447e-06, "loss": 0.4717, "step": 25980 }, { "epoch": 2.5218319425577334, "grad_norm": 2.262448485653634, "learning_rate": 7.995327406061393e-06, "loss": 0.4909, "step": 25990 }, { "epoch": 2.5228022511158548, "grad_norm": 1.995029631522796, "learning_rate": 7.97910312155234e-06, "loss": 0.4714, "step": 26000 }, { "epoch": 2.5228022511158548, "eval_loss": 0.626392662525177, "eval_runtime": 3078.3271, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 26000 }, { "epoch": 2.523772559673976, "grad_norm": 2.471379884050611, "learning_rate": 7.962878837043287e-06, "loss": 0.4437, "step": 26010 }, { "epoch": 2.5247428682320976, "grad_norm": 2.3011841532666084, "learning_rate": 7.946654552534234e-06, "loss": 0.4811, "step": 26020 }, { "epoch": 2.5257131767902194, "grad_norm": 2.328615095608579, "learning_rate": 7.93043026802518e-06, "loss": 0.4926, "step": 26030 }, { "epoch": 2.526683485348341, "grad_norm": 2.008542874656307, "learning_rate": 7.914205983516128e-06, "loss": 0.3919, "step": 26040 }, { "epoch": 2.5276537939064623, "grad_norm": 2.3976389886549025, "learning_rate": 7.897981699007075e-06, "loss": 0.4444, "step": 26050 }, { "epoch": 2.5286241024645837, "grad_norm": 1.8427215260803036, "learning_rate": 7.881757414498021e-06, "loss": 0.4142, "step": 26060 }, { "epoch": 2.529594411022705, "grad_norm": 2.4210404140795734, "learning_rate": 7.865533129988967e-06, "loss": 0.5065, "step": 26070 }, { "epoch": 2.530564719580827, "grad_norm": 2.6149452420905237, "learning_rate": 7.849308845479915e-06, "loss": 0.5364, "step": 26080 }, { "epoch": 2.531535028138948, "grad_norm": 1.250128383435868, "learning_rate": 7.833084560970862e-06, "loss": 0.474, "step": 26090 }, { "epoch": 2.53250533669707, "grad_norm": 2.2654863485991514, "learning_rate": 7.816860276461808e-06, "loss": 0.4398, "step": 26100 }, { "epoch": 2.533475645255191, "grad_norm": 1.6321523455990505, "learning_rate": 7.800635991952755e-06, "loss": 0.4603, "step": 26110 }, { "epoch": 2.5344459538133126, "grad_norm": 2.3633081540698697, "learning_rate": 7.784411707443703e-06, "loss": 0.4404, "step": 26120 }, { "epoch": 2.535416262371434, "grad_norm": 2.077354268429289, "learning_rate": 7.768187422934649e-06, "loss": 0.5125, "step": 26130 }, { "epoch": 2.5363865709295554, "grad_norm": 1.980465618606371, "learning_rate": 7.751963138425595e-06, "loss": 0.4, "step": 26140 }, { "epoch": 2.5373568794876773, "grad_norm": 2.1634065659364676, "learning_rate": 7.735738853916542e-06, "loss": 0.4632, "step": 26150 }, { "epoch": 2.5383271880457987, "grad_norm": 2.3670777081680643, "learning_rate": 7.71951456940749e-06, "loss": 0.4982, "step": 26160 }, { "epoch": 2.53929749660392, "grad_norm": 2.1219544634411207, "learning_rate": 7.703290284898436e-06, "loss": 0.4946, "step": 26170 }, { "epoch": 2.5402678051620415, "grad_norm": 2.295772616548578, "learning_rate": 7.687066000389383e-06, "loss": 0.4463, "step": 26180 }, { "epoch": 2.541238113720163, "grad_norm": 2.190516448432997, "learning_rate": 7.670841715880329e-06, "loss": 0.4762, "step": 26190 }, { "epoch": 2.5422084222782844, "grad_norm": 1.540362328239341, "learning_rate": 7.654617431371275e-06, "loss": 0.4457, "step": 26200 }, { "epoch": 2.5431787308364058, "grad_norm": 2.107515111823764, "learning_rate": 7.638393146862225e-06, "loss": 0.4443, "step": 26210 }, { "epoch": 2.5441490393945276, "grad_norm": 2.174561070960879, "learning_rate": 7.622168862353171e-06, "loss": 0.4504, "step": 26220 }, { "epoch": 2.545119347952649, "grad_norm": 1.9086610783713582, "learning_rate": 7.605944577844118e-06, "loss": 0.4352, "step": 26230 }, { "epoch": 2.5460896565107705, "grad_norm": 2.3919380566616653, "learning_rate": 7.589720293335064e-06, "loss": 0.4607, "step": 26240 }, { "epoch": 2.547059965068892, "grad_norm": 2.1753339423630056, "learning_rate": 7.573496008826012e-06, "loss": 0.4774, "step": 26250 }, { "epoch": 2.5480302736270133, "grad_norm": 1.7085273957705185, "learning_rate": 7.557271724316958e-06, "loss": 0.5047, "step": 26260 }, { "epoch": 2.549000582185135, "grad_norm": 1.9448755177016561, "learning_rate": 7.541047439807905e-06, "loss": 0.4486, "step": 26270 }, { "epoch": 2.549970890743256, "grad_norm": 2.2690590512497217, "learning_rate": 7.524823155298852e-06, "loss": 0.4762, "step": 26280 }, { "epoch": 2.550941199301378, "grad_norm": 1.6386034921733572, "learning_rate": 7.508598870789799e-06, "loss": 0.4697, "step": 26290 }, { "epoch": 2.5519115078594994, "grad_norm": 2.6752559812079597, "learning_rate": 7.492374586280745e-06, "loss": 0.4696, "step": 26300 }, { "epoch": 2.552881816417621, "grad_norm": 2.4320177836419394, "learning_rate": 7.4761503017716925e-06, "loss": 0.4498, "step": 26310 }, { "epoch": 2.553852124975742, "grad_norm": 2.3730930844509794, "learning_rate": 7.459926017262639e-06, "loss": 0.4567, "step": 26320 }, { "epoch": 2.5548224335338636, "grad_norm": 2.0794087640525194, "learning_rate": 7.443701732753586e-06, "loss": 0.4502, "step": 26330 }, { "epoch": 2.5557927420919855, "grad_norm": 2.0181401788364353, "learning_rate": 7.4274774482445325e-06, "loss": 0.553, "step": 26340 }, { "epoch": 2.556763050650107, "grad_norm": 2.004831479835484, "learning_rate": 7.41125316373548e-06, "loss": 0.4493, "step": 26350 }, { "epoch": 2.5577333592082283, "grad_norm": 2.391763802509909, "learning_rate": 7.395028879226426e-06, "loss": 0.4472, "step": 26360 }, { "epoch": 2.5587036677663497, "grad_norm": 2.1046379629994547, "learning_rate": 7.3788045947173724e-06, "loss": 0.4867, "step": 26370 }, { "epoch": 2.559673976324471, "grad_norm": 1.7439712274723973, "learning_rate": 7.36258031020832e-06, "loss": 0.4757, "step": 26380 }, { "epoch": 2.5606442848825925, "grad_norm": 2.675584030100802, "learning_rate": 7.346356025699266e-06, "loss": 0.442, "step": 26390 }, { "epoch": 2.561614593440714, "grad_norm": 2.747327409324538, "learning_rate": 7.330131741190213e-06, "loss": 0.3996, "step": 26400 }, { "epoch": 2.562584901998836, "grad_norm": 1.918371536565268, "learning_rate": 7.313907456681161e-06, "loss": 0.4295, "step": 26410 }, { "epoch": 2.5635552105569572, "grad_norm": 2.6905148418494464, "learning_rate": 7.297683172172108e-06, "loss": 0.4072, "step": 26420 }, { "epoch": 2.5645255191150786, "grad_norm": 2.3471372105618538, "learning_rate": 7.281458887663055e-06, "loss": 0.4877, "step": 26430 }, { "epoch": 2.5654958276732, "grad_norm": 2.3503064236531177, "learning_rate": 7.265234603154001e-06, "loss": 0.4478, "step": 26440 }, { "epoch": 2.5664661362313215, "grad_norm": 2.3207840197512306, "learning_rate": 7.249010318644949e-06, "loss": 0.5022, "step": 26450 }, { "epoch": 2.567436444789443, "grad_norm": 2.2844461066346304, "learning_rate": 7.232786034135895e-06, "loss": 0.4967, "step": 26460 }, { "epoch": 2.5684067533475643, "grad_norm": 2.105871826875459, "learning_rate": 7.216561749626842e-06, "loss": 0.4661, "step": 26470 }, { "epoch": 2.569377061905686, "grad_norm": 1.5619924197127195, "learning_rate": 7.200337465117789e-06, "loss": 0.4706, "step": 26480 }, { "epoch": 2.5703473704638076, "grad_norm": 2.403644705382442, "learning_rate": 7.184113180608736e-06, "loss": 0.4469, "step": 26490 }, { "epoch": 2.571317679021929, "grad_norm": 2.221300148016647, "learning_rate": 7.167888896099682e-06, "loss": 0.4487, "step": 26500 }, { "epoch": 2.571317679021929, "eval_loss": 0.6245245933532715, "eval_runtime": 3081.2065, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 26500 }, { "epoch": 2.5722879875800504, "grad_norm": 2.3890702890227025, "learning_rate": 7.1516646115906294e-06, "loss": 0.4809, "step": 26510 }, { "epoch": 2.573258296138172, "grad_norm": 1.8300717911067486, "learning_rate": 7.135440327081576e-06, "loss": 0.4813, "step": 26520 }, { "epoch": 2.5742286046962937, "grad_norm": 2.1701292735376083, "learning_rate": 7.119216042572523e-06, "loss": 0.448, "step": 26530 }, { "epoch": 2.5751989132544146, "grad_norm": 2.0970645260082916, "learning_rate": 7.102991758063469e-06, "loss": 0.4856, "step": 26540 }, { "epoch": 2.5761692218125365, "grad_norm": 2.286663117034141, "learning_rate": 7.086767473554417e-06, "loss": 0.4264, "step": 26550 }, { "epoch": 2.577139530370658, "grad_norm": 2.338250784130726, "learning_rate": 7.070543189045363e-06, "loss": 0.4964, "step": 26560 }, { "epoch": 2.5781098389287793, "grad_norm": 2.495668443409847, "learning_rate": 7.054318904536309e-06, "loss": 0.4145, "step": 26570 }, { "epoch": 2.5790801474869007, "grad_norm": 2.111942378758775, "learning_rate": 7.038094620027257e-06, "loss": 0.4734, "step": 26580 }, { "epoch": 2.580050456045022, "grad_norm": 2.2717279926914054, "learning_rate": 7.021870335518203e-06, "loss": 0.4629, "step": 26590 }, { "epoch": 2.581020764603144, "grad_norm": 1.6070510989925946, "learning_rate": 7.005646051009151e-06, "loss": 0.505, "step": 26600 }, { "epoch": 2.5819910731612654, "grad_norm": 1.6938672428162131, "learning_rate": 6.989421766500098e-06, "loss": 0.4077, "step": 26610 }, { "epoch": 2.582961381719387, "grad_norm": 1.964059789636889, "learning_rate": 6.973197481991045e-06, "loss": 0.4626, "step": 26620 }, { "epoch": 2.5839316902775082, "grad_norm": 1.5970817911343262, "learning_rate": 6.956973197481992e-06, "loss": 0.4555, "step": 26630 }, { "epoch": 2.5849019988356297, "grad_norm": 1.2648448060539477, "learning_rate": 6.940748912972938e-06, "loss": 0.4148, "step": 26640 }, { "epoch": 2.585872307393751, "grad_norm": 1.9959502430536304, "learning_rate": 6.9245246284638856e-06, "loss": 0.4279, "step": 26650 }, { "epoch": 2.5868426159518725, "grad_norm": 2.1952788522193107, "learning_rate": 6.908300343954832e-06, "loss": 0.4223, "step": 26660 }, { "epoch": 2.5878129245099943, "grad_norm": 1.7032019581340345, "learning_rate": 6.892076059445779e-06, "loss": 0.472, "step": 26670 }, { "epoch": 2.5887832330681158, "grad_norm": 2.066926648246943, "learning_rate": 6.8758517749367256e-06, "loss": 0.472, "step": 26680 }, { "epoch": 2.589753541626237, "grad_norm": 2.446811117128422, "learning_rate": 6.859627490427673e-06, "loss": 0.432, "step": 26690 }, { "epoch": 2.5907238501843586, "grad_norm": 2.8022327089785954, "learning_rate": 6.843403205918619e-06, "loss": 0.429, "step": 26700 }, { "epoch": 2.59169415874248, "grad_norm": 2.1728078918321043, "learning_rate": 6.827178921409566e-06, "loss": 0.4569, "step": 26710 }, { "epoch": 2.592664467300602, "grad_norm": 2.399123225602425, "learning_rate": 6.810954636900513e-06, "loss": 0.4916, "step": 26720 }, { "epoch": 2.593634775858723, "grad_norm": 1.8937138660322683, "learning_rate": 6.79473035239146e-06, "loss": 0.461, "step": 26730 }, { "epoch": 2.5946050844168447, "grad_norm": 2.2958947329084998, "learning_rate": 6.778506067882406e-06, "loss": 0.4696, "step": 26740 }, { "epoch": 2.595575392974966, "grad_norm": 1.6951591347255646, "learning_rate": 6.762281783373353e-06, "loss": 0.4459, "step": 26750 }, { "epoch": 2.5965457015330875, "grad_norm": 1.8661594180037824, "learning_rate": 6.7460574988643e-06, "loss": 0.4185, "step": 26760 }, { "epoch": 2.597516010091209, "grad_norm": 2.426809794084687, "learning_rate": 6.729833214355246e-06, "loss": 0.4725, "step": 26770 }, { "epoch": 2.5984863186493303, "grad_norm": 2.0754266308428417, "learning_rate": 6.713608929846194e-06, "loss": 0.4747, "step": 26780 }, { "epoch": 2.599456627207452, "grad_norm": 2.18005859216409, "learning_rate": 6.69738464533714e-06, "loss": 0.4206, "step": 26790 }, { "epoch": 2.6004269357655736, "grad_norm": 1.823840317489889, "learning_rate": 6.681160360828088e-06, "loss": 0.4436, "step": 26800 }, { "epoch": 2.601397244323695, "grad_norm": 2.0338375240259228, "learning_rate": 6.664936076319035e-06, "loss": 0.4845, "step": 26810 }, { "epoch": 2.6023675528818164, "grad_norm": 2.014705292962573, "learning_rate": 6.648711791809982e-06, "loss": 0.482, "step": 26820 }, { "epoch": 2.603337861439938, "grad_norm": 1.9428925353974025, "learning_rate": 6.632487507300929e-06, "loss": 0.4581, "step": 26830 }, { "epoch": 2.6043081699980593, "grad_norm": 2.2064825726181185, "learning_rate": 6.616263222791875e-06, "loss": 0.4886, "step": 26840 }, { "epoch": 2.6052784785561807, "grad_norm": 1.6187717762768667, "learning_rate": 6.6000389382828225e-06, "loss": 0.4992, "step": 26850 }, { "epoch": 2.6062487871143025, "grad_norm": 1.8637396133321595, "learning_rate": 6.583814653773769e-06, "loss": 0.4397, "step": 26860 }, { "epoch": 2.607219095672424, "grad_norm": 1.618701731041116, "learning_rate": 6.567590369264716e-06, "loss": 0.4259, "step": 26870 }, { "epoch": 2.6081894042305453, "grad_norm": 1.9096015587966633, "learning_rate": 6.5513660847556625e-06, "loss": 0.4524, "step": 26880 }, { "epoch": 2.6091597127886668, "grad_norm": 2.224519506513324, "learning_rate": 6.53514180024661e-06, "loss": 0.427, "step": 26890 }, { "epoch": 2.610130021346788, "grad_norm": 2.2776265487214356, "learning_rate": 6.518917515737556e-06, "loss": 0.4738, "step": 26900 }, { "epoch": 2.61110032990491, "grad_norm": 2.210641578491222, "learning_rate": 6.502693231228503e-06, "loss": 0.4523, "step": 26910 }, { "epoch": 2.612070638463031, "grad_norm": 2.0544627825148454, "learning_rate": 6.48646894671945e-06, "loss": 0.3989, "step": 26920 }, { "epoch": 2.613040947021153, "grad_norm": 1.8258262217309098, "learning_rate": 6.470244662210396e-06, "loss": 0.4182, "step": 26930 }, { "epoch": 2.6140112555792743, "grad_norm": 1.7871476079308524, "learning_rate": 6.454020377701343e-06, "loss": 0.4596, "step": 26940 }, { "epoch": 2.6149815641373957, "grad_norm": 2.4349613489784225, "learning_rate": 6.43779609319229e-06, "loss": 0.4391, "step": 26950 }, { "epoch": 2.615951872695517, "grad_norm": 2.0784553450473906, "learning_rate": 6.421571808683237e-06, "loss": 0.4623, "step": 26960 }, { "epoch": 2.6169221812536385, "grad_norm": 1.6782863658986358, "learning_rate": 6.405347524174183e-06, "loss": 0.4702, "step": 26970 }, { "epoch": 2.6178924898117604, "grad_norm": 2.0627005377584515, "learning_rate": 6.389123239665131e-06, "loss": 0.4071, "step": 26980 }, { "epoch": 2.618862798369882, "grad_norm": 2.228692956884338, "learning_rate": 6.372898955156079e-06, "loss": 0.4277, "step": 26990 }, { "epoch": 2.619833106928003, "grad_norm": 2.260660831872594, "learning_rate": 6.356674670647025e-06, "loss": 0.4559, "step": 27000 }, { "epoch": 2.619833106928003, "eval_loss": 0.6265242695808411, "eval_runtime": 3077.6673, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 27000 }, { "epoch": 2.6208034154861246, "grad_norm": 2.0916943763147495, "learning_rate": 6.340450386137972e-06, "loss": 0.5057, "step": 27010 }, { "epoch": 2.621773724044246, "grad_norm": 1.9088165017857428, "learning_rate": 6.324226101628919e-06, "loss": 0.4894, "step": 27020 }, { "epoch": 2.6227440326023674, "grad_norm": 2.143958468951753, "learning_rate": 6.308001817119866e-06, "loss": 0.495, "step": 27030 }, { "epoch": 2.623714341160489, "grad_norm": 1.9473747049018377, "learning_rate": 6.291777532610812e-06, "loss": 0.4384, "step": 27040 }, { "epoch": 2.6246846497186107, "grad_norm": 1.8217324370349153, "learning_rate": 6.2755532481017595e-06, "loss": 0.458, "step": 27050 }, { "epoch": 2.625654958276732, "grad_norm": 2.1374968806122974, "learning_rate": 6.259328963592706e-06, "loss": 0.4396, "step": 27060 }, { "epoch": 2.6266252668348535, "grad_norm": 2.0004473353416197, "learning_rate": 6.243104679083653e-06, "loss": 0.447, "step": 27070 }, { "epoch": 2.627595575392975, "grad_norm": 2.0875236661724377, "learning_rate": 6.2268803945745995e-06, "loss": 0.4598, "step": 27080 }, { "epoch": 2.6285658839510964, "grad_norm": 1.4975583369058618, "learning_rate": 6.210656110065547e-06, "loss": 0.4394, "step": 27090 }, { "epoch": 2.6295361925092178, "grad_norm": 1.7719004882403615, "learning_rate": 6.194431825556493e-06, "loss": 0.4572, "step": 27100 }, { "epoch": 2.630506501067339, "grad_norm": 1.75138803938062, "learning_rate": 6.17820754104744e-06, "loss": 0.4207, "step": 27110 }, { "epoch": 2.631476809625461, "grad_norm": 2.328411365261218, "learning_rate": 6.161983256538387e-06, "loss": 0.4613, "step": 27120 }, { "epoch": 2.6324471181835825, "grad_norm": 2.6125153987057406, "learning_rate": 6.145758972029333e-06, "loss": 0.5289, "step": 27130 }, { "epoch": 2.633417426741704, "grad_norm": 2.0526398963778703, "learning_rate": 6.129534687520281e-06, "loss": 0.4533, "step": 27140 }, { "epoch": 2.6343877352998253, "grad_norm": 1.9866810774018402, "learning_rate": 6.1133104030112276e-06, "loss": 0.4154, "step": 27150 }, { "epoch": 2.6353580438579467, "grad_norm": 2.4795632135874754, "learning_rate": 6.097086118502175e-06, "loss": 0.495, "step": 27160 }, { "epoch": 2.6363283524160686, "grad_norm": 2.2566551484246933, "learning_rate": 6.080861833993121e-06, "loss": 0.5013, "step": 27170 }, { "epoch": 2.6372986609741895, "grad_norm": 1.7690014101548142, "learning_rate": 6.064637549484068e-06, "loss": 0.4791, "step": 27180 }, { "epoch": 2.6382689695323114, "grad_norm": 1.7917925386398774, "learning_rate": 6.048413264975015e-06, "loss": 0.4604, "step": 27190 }, { "epoch": 2.639239278090433, "grad_norm": 2.605089481537401, "learning_rate": 6.032188980465962e-06, "loss": 0.4406, "step": 27200 }, { "epoch": 2.640209586648554, "grad_norm": 2.6346337281388976, "learning_rate": 6.015964695956908e-06, "loss": 0.4569, "step": 27210 }, { "epoch": 2.6411798952066756, "grad_norm": 2.356192113804472, "learning_rate": 5.999740411447855e-06, "loss": 0.5287, "step": 27220 }, { "epoch": 2.642150203764797, "grad_norm": 1.75287937703708, "learning_rate": 5.983516126938802e-06, "loss": 0.4158, "step": 27230 }, { "epoch": 2.643120512322919, "grad_norm": 1.9667674004416014, "learning_rate": 5.967291842429749e-06, "loss": 0.4398, "step": 27240 }, { "epoch": 2.6440908208810403, "grad_norm": 1.6605391867275763, "learning_rate": 5.9510675579206965e-06, "loss": 0.4402, "step": 27250 }, { "epoch": 2.6450611294391617, "grad_norm": 2.86103929372877, "learning_rate": 5.934843273411643e-06, "loss": 0.4861, "step": 27260 }, { "epoch": 2.646031437997283, "grad_norm": 2.2937442610124217, "learning_rate": 5.91861898890259e-06, "loss": 0.4358, "step": 27270 }, { "epoch": 2.6470017465554045, "grad_norm": 2.0761751912314685, "learning_rate": 5.9023947043935365e-06, "loss": 0.4608, "step": 27280 }, { "epoch": 2.647972055113526, "grad_norm": 2.2831659042363985, "learning_rate": 5.886170419884484e-06, "loss": 0.4792, "step": 27290 }, { "epoch": 2.6489423636716474, "grad_norm": 2.1893852849473783, "learning_rate": 5.86994613537543e-06, "loss": 0.4816, "step": 27300 }, { "epoch": 2.6499126722297692, "grad_norm": 2.0094924259039884, "learning_rate": 5.8537218508663765e-06, "loss": 0.4531, "step": 27310 }, { "epoch": 2.6508829807878906, "grad_norm": 1.6511270189112646, "learning_rate": 5.837497566357324e-06, "loss": 0.4354, "step": 27320 }, { "epoch": 2.651853289346012, "grad_norm": 2.0974960389467867, "learning_rate": 5.821273281848271e-06, "loss": 0.4744, "step": 27330 }, { "epoch": 2.6528235979041335, "grad_norm": 2.0327735679574945, "learning_rate": 5.805048997339218e-06, "loss": 0.4533, "step": 27340 }, { "epoch": 2.653793906462255, "grad_norm": 2.065148492434501, "learning_rate": 5.7888247128301645e-06, "loss": 0.4301, "step": 27350 }, { "epoch": 2.6547642150203767, "grad_norm": 1.6135569555911147, "learning_rate": 5.772600428321112e-06, "loss": 0.4263, "step": 27360 }, { "epoch": 2.6557345235784977, "grad_norm": 2.312598613502463, "learning_rate": 5.756376143812058e-06, "loss": 0.4425, "step": 27370 }, { "epoch": 2.6567048321366196, "grad_norm": 2.1271258405405504, "learning_rate": 5.740151859303005e-06, "loss": 0.532, "step": 27380 }, { "epoch": 2.657675140694741, "grad_norm": 1.9920221138599952, "learning_rate": 5.723927574793952e-06, "loss": 0.5004, "step": 27390 }, { "epoch": 2.6586454492528624, "grad_norm": 1.8128823123895512, "learning_rate": 5.707703290284899e-06, "loss": 0.4768, "step": 27400 }, { "epoch": 2.659615757810984, "grad_norm": 2.1482597960934924, "learning_rate": 5.691479005775845e-06, "loss": 0.4365, "step": 27410 }, { "epoch": 2.660586066369105, "grad_norm": 2.0557889289828597, "learning_rate": 5.675254721266792e-06, "loss": 0.4701, "step": 27420 }, { "epoch": 2.661556374927227, "grad_norm": 2.3778742440264473, "learning_rate": 5.65903043675774e-06, "loss": 0.434, "step": 27430 }, { "epoch": 2.6625266834853485, "grad_norm": 2.06624372721149, "learning_rate": 5.642806152248686e-06, "loss": 0.4562, "step": 27440 }, { "epoch": 2.66349699204347, "grad_norm": 2.5677503879903028, "learning_rate": 5.6265818677396334e-06, "loss": 0.4514, "step": 27450 }, { "epoch": 2.6644673006015913, "grad_norm": 2.1662148907368435, "learning_rate": 5.61035758323058e-06, "loss": 0.4807, "step": 27460 }, { "epoch": 2.6654376091597127, "grad_norm": 2.3514376976813534, "learning_rate": 5.594133298721527e-06, "loss": 0.4544, "step": 27470 }, { "epoch": 2.666407917717834, "grad_norm": 1.9936514975140516, "learning_rate": 5.5779090142124734e-06, "loss": 0.4585, "step": 27480 }, { "epoch": 2.6673782262759556, "grad_norm": 2.333872871364999, "learning_rate": 5.561684729703421e-06, "loss": 0.425, "step": 27490 }, { "epoch": 2.6683485348340774, "grad_norm": 2.046202909310476, "learning_rate": 5.545460445194367e-06, "loss": 0.4358, "step": 27500 }, { "epoch": 2.6683485348340774, "eval_loss": 0.6262698769569397, "eval_runtime": 3075.4574, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 27500 }, { "epoch": 2.669318843392199, "grad_norm": 2.142468059686056, "learning_rate": 5.529236160685313e-06, "loss": 0.4074, "step": 27510 }, { "epoch": 2.6702891519503202, "grad_norm": 2.362721384452117, "learning_rate": 5.513011876176261e-06, "loss": 0.4555, "step": 27520 }, { "epoch": 2.6712594605084417, "grad_norm": 2.358046553519279, "learning_rate": 5.496787591667208e-06, "loss": 0.4976, "step": 27530 }, { "epoch": 2.672229769066563, "grad_norm": 2.429874027725808, "learning_rate": 5.480563307158155e-06, "loss": 0.4664, "step": 27540 }, { "epoch": 2.673200077624685, "grad_norm": 1.9576896980154457, "learning_rate": 5.4643390226491015e-06, "loss": 0.4964, "step": 27550 }, { "epoch": 2.674170386182806, "grad_norm": 2.379637675056002, "learning_rate": 5.448114738140049e-06, "loss": 0.4322, "step": 27560 }, { "epoch": 2.6751406947409277, "grad_norm": 2.0518643985265763, "learning_rate": 5.431890453630995e-06, "loss": 0.4675, "step": 27570 }, { "epoch": 2.676111003299049, "grad_norm": 2.302945792516606, "learning_rate": 5.415666169121942e-06, "loss": 0.4334, "step": 27580 }, { "epoch": 2.6770813118571706, "grad_norm": 2.1010724003665033, "learning_rate": 5.399441884612889e-06, "loss": 0.5061, "step": 27590 }, { "epoch": 2.678051620415292, "grad_norm": 2.176927437873621, "learning_rate": 5.383217600103835e-06, "loss": 0.4383, "step": 27600 }, { "epoch": 2.6790219289734134, "grad_norm": 2.390870268803028, "learning_rate": 5.366993315594782e-06, "loss": 0.4164, "step": 27610 }, { "epoch": 2.6799922375315353, "grad_norm": 2.1498792271809126, "learning_rate": 5.350769031085729e-06, "loss": 0.4282, "step": 27620 }, { "epoch": 2.6809625460896567, "grad_norm": 2.237938360410985, "learning_rate": 5.334544746576677e-06, "loss": 0.4281, "step": 27630 }, { "epoch": 2.681932854647778, "grad_norm": 1.895585738408493, "learning_rate": 5.318320462067623e-06, "loss": 0.4797, "step": 27640 }, { "epoch": 2.6829031632058995, "grad_norm": 1.9798973531305697, "learning_rate": 5.30209617755857e-06, "loss": 0.4891, "step": 27650 }, { "epoch": 2.683873471764021, "grad_norm": 1.8137430085787976, "learning_rate": 5.285871893049517e-06, "loss": 0.4738, "step": 27660 }, { "epoch": 2.6848437803221423, "grad_norm": 2.5348762541455683, "learning_rate": 5.269647608540464e-06, "loss": 0.4411, "step": 27670 }, { "epoch": 2.6858140888802637, "grad_norm": 2.0560974006481434, "learning_rate": 5.25342332403141e-06, "loss": 0.4621, "step": 27680 }, { "epoch": 2.6867843974383856, "grad_norm": 2.042608730513543, "learning_rate": 5.237199039522357e-06, "loss": 0.5004, "step": 27690 }, { "epoch": 2.687754705996507, "grad_norm": 1.8274478779694128, "learning_rate": 5.220974755013304e-06, "loss": 0.4253, "step": 27700 }, { "epoch": 2.6887250145546284, "grad_norm": 2.048526748970328, "learning_rate": 5.20475047050425e-06, "loss": 0.4388, "step": 27710 }, { "epoch": 2.68969532311275, "grad_norm": 1.6967933553007366, "learning_rate": 5.188526185995198e-06, "loss": 0.4238, "step": 27720 }, { "epoch": 2.6906656316708712, "grad_norm": 2.274135242406485, "learning_rate": 5.172301901486145e-06, "loss": 0.4681, "step": 27730 }, { "epoch": 2.6916359402289927, "grad_norm": 2.078510231220565, "learning_rate": 5.156077616977092e-06, "loss": 0.4838, "step": 27740 }, { "epoch": 2.692606248787114, "grad_norm": 2.0817065896920224, "learning_rate": 5.1398533324680385e-06, "loss": 0.4428, "step": 27750 }, { "epoch": 2.693576557345236, "grad_norm": 2.2512847900656583, "learning_rate": 5.123629047958986e-06, "loss": 0.4429, "step": 27760 }, { "epoch": 2.6945468659033573, "grad_norm": 1.1429132262434476, "learning_rate": 5.107404763449932e-06, "loss": 0.4615, "step": 27770 }, { "epoch": 2.6955171744614788, "grad_norm": 1.756148813084678, "learning_rate": 5.091180478940879e-06, "loss": 0.4248, "step": 27780 }, { "epoch": 2.6964874830196, "grad_norm": 2.0414574495400504, "learning_rate": 5.074956194431826e-06, "loss": 0.4736, "step": 27790 }, { "epoch": 2.6974577915777216, "grad_norm": 1.807258238700552, "learning_rate": 5.058731909922772e-06, "loss": 0.4502, "step": 27800 }, { "epoch": 2.6984281001358434, "grad_norm": 2.3181866298162754, "learning_rate": 5.042507625413719e-06, "loss": 0.4732, "step": 27810 }, { "epoch": 2.6993984086939644, "grad_norm": 2.23238964893833, "learning_rate": 5.0262833409046665e-06, "loss": 0.4911, "step": 27820 }, { "epoch": 2.7003687172520863, "grad_norm": 2.4704315007732656, "learning_rate": 5.010059056395614e-06, "loss": 0.3893, "step": 27830 }, { "epoch": 2.7013390258102077, "grad_norm": 2.103825766373529, "learning_rate": 4.99383477188656e-06, "loss": 0.4738, "step": 27840 }, { "epoch": 2.702309334368329, "grad_norm": 1.8267381387121444, "learning_rate": 4.977610487377507e-06, "loss": 0.4507, "step": 27850 }, { "epoch": 2.7032796429264505, "grad_norm": 2.9282048804626397, "learning_rate": 4.961386202868454e-06, "loss": 0.4572, "step": 27860 }, { "epoch": 2.704249951484572, "grad_norm": 2.1905551359384994, "learning_rate": 4.945161918359401e-06, "loss": 0.4165, "step": 27870 }, { "epoch": 2.705220260042694, "grad_norm": 2.0122828099696397, "learning_rate": 4.928937633850347e-06, "loss": 0.4592, "step": 27880 }, { "epoch": 2.706190568600815, "grad_norm": 2.1103570036325534, "learning_rate": 4.912713349341294e-06, "loss": 0.4811, "step": 27890 }, { "epoch": 2.7071608771589366, "grad_norm": 2.134559653048109, "learning_rate": 4.896489064832241e-06, "loss": 0.4592, "step": 27900 }, { "epoch": 2.708131185717058, "grad_norm": 1.9012812893178395, "learning_rate": 4.880264780323187e-06, "loss": 0.4494, "step": 27910 }, { "epoch": 2.7091014942751794, "grad_norm": 2.173892363265671, "learning_rate": 4.8640404958141354e-06, "loss": 0.4662, "step": 27920 }, { "epoch": 2.710071802833301, "grad_norm": 2.091122535871686, "learning_rate": 4.847816211305082e-06, "loss": 0.5142, "step": 27930 }, { "epoch": 2.7110421113914223, "grad_norm": 1.6486354925817308, "learning_rate": 4.831591926796029e-06, "loss": 0.4485, "step": 27940 }, { "epoch": 2.712012419949544, "grad_norm": 2.0239775829976914, "learning_rate": 4.8153676422869754e-06, "loss": 0.4429, "step": 27950 }, { "epoch": 2.7129827285076655, "grad_norm": 1.9983102332503897, "learning_rate": 4.799143357777923e-06, "loss": 0.4747, "step": 27960 }, { "epoch": 2.713953037065787, "grad_norm": 1.8879150821349495, "learning_rate": 4.782919073268869e-06, "loss": 0.4503, "step": 27970 }, { "epoch": 2.7149233456239084, "grad_norm": 2.5111602400330275, "learning_rate": 4.766694788759815e-06, "loss": 0.39, "step": 27980 }, { "epoch": 2.7158936541820298, "grad_norm": 1.8358780945412772, "learning_rate": 4.750470504250763e-06, "loss": 0.4408, "step": 27990 }, { "epoch": 2.7168639627401516, "grad_norm": 1.8713813304884717, "learning_rate": 4.734246219741709e-06, "loss": 0.4325, "step": 28000 }, { "epoch": 2.7168639627401516, "eval_loss": 0.6255120038986206, "eval_runtime": 3134.5308, "eval_samples_per_second": 0.572, "eval_steps_per_second": 0.286, "step": 28000 }, { "epoch": 2.7178342712982726, "grad_norm": 2.2407549956214776, "learning_rate": 4.718021935232656e-06, "loss": 0.4584, "step": 28010 }, { "epoch": 2.7188045798563945, "grad_norm": 2.0789438434805914, "learning_rate": 4.7017976507236035e-06, "loss": 0.5145, "step": 28020 }, { "epoch": 2.719774888414516, "grad_norm": 1.953437458993961, "learning_rate": 4.685573366214551e-06, "loss": 0.4461, "step": 28030 }, { "epoch": 2.7207451969726373, "grad_norm": 2.2690258975305766, "learning_rate": 4.669349081705497e-06, "loss": 0.4345, "step": 28040 }, { "epoch": 2.7217155055307587, "grad_norm": 2.472665029273779, "learning_rate": 4.653124797196444e-06, "loss": 0.4192, "step": 28050 }, { "epoch": 2.72268581408888, "grad_norm": 2.369484805983746, "learning_rate": 4.636900512687391e-06, "loss": 0.3896, "step": 28060 }, { "epoch": 2.723656122647002, "grad_norm": 1.5439527640270017, "learning_rate": 4.620676228178337e-06, "loss": 0.4277, "step": 28070 }, { "epoch": 2.7246264312051234, "grad_norm": 1.6475093472819964, "learning_rate": 4.604451943669284e-06, "loss": 0.5008, "step": 28080 }, { "epoch": 2.725596739763245, "grad_norm": 1.9729369776568129, "learning_rate": 4.588227659160231e-06, "loss": 0.4832, "step": 28090 }, { "epoch": 2.726567048321366, "grad_norm": 1.8249644804468674, "learning_rate": 4.572003374651178e-06, "loss": 0.4766, "step": 28100 }, { "epoch": 2.7275373568794876, "grad_norm": 2.0428245799786326, "learning_rate": 4.555779090142124e-06, "loss": 0.37, "step": 28110 }, { "epoch": 2.728507665437609, "grad_norm": 2.3067653567941195, "learning_rate": 4.539554805633072e-06, "loss": 0.4915, "step": 28120 }, { "epoch": 2.7294779739957304, "grad_norm": 2.508385066029743, "learning_rate": 4.523330521124019e-06, "loss": 0.4669, "step": 28130 }, { "epoch": 2.7304482825538523, "grad_norm": 2.4008677729061274, "learning_rate": 4.507106236614966e-06, "loss": 0.4387, "step": 28140 }, { "epoch": 2.7314185911119737, "grad_norm": 1.9860137417678747, "learning_rate": 4.490881952105912e-06, "loss": 0.5287, "step": 28150 }, { "epoch": 2.732388899670095, "grad_norm": 1.9556909839133516, "learning_rate": 4.474657667596859e-06, "loss": 0.4513, "step": 28160 }, { "epoch": 2.7333592082282165, "grad_norm": 2.379748735610722, "learning_rate": 4.458433383087806e-06, "loss": 0.4611, "step": 28170 }, { "epoch": 2.734329516786338, "grad_norm": 1.5081631528264992, "learning_rate": 4.442209098578752e-06, "loss": 0.4342, "step": 28180 }, { "epoch": 2.73529982534446, "grad_norm": 2.4579236546327436, "learning_rate": 4.4259848140697e-06, "loss": 0.4906, "step": 28190 }, { "epoch": 2.736270133902581, "grad_norm": 2.520647631978861, "learning_rate": 4.409760529560646e-06, "loss": 0.4263, "step": 28200 }, { "epoch": 2.7372404424607026, "grad_norm": 2.513810505246578, "learning_rate": 4.393536245051594e-06, "loss": 0.4588, "step": 28210 }, { "epoch": 2.738210751018824, "grad_norm": 2.590482052355743, "learning_rate": 4.3773119605425405e-06, "loss": 0.4406, "step": 28220 }, { "epoch": 2.7391810595769455, "grad_norm": 2.1382620328102844, "learning_rate": 4.361087676033488e-06, "loss": 0.4925, "step": 28230 }, { "epoch": 2.740151368135067, "grad_norm": 1.1577032979842956, "learning_rate": 4.344863391524434e-06, "loss": 0.436, "step": 28240 }, { "epoch": 2.7411216766931883, "grad_norm": 2.1619456500192236, "learning_rate": 4.328639107015381e-06, "loss": 0.4263, "step": 28250 }, { "epoch": 2.74209198525131, "grad_norm": 2.0853557049385896, "learning_rate": 4.312414822506328e-06, "loss": 0.4901, "step": 28260 }, { "epoch": 2.7430622938094316, "grad_norm": 1.4192124445752605, "learning_rate": 4.296190537997274e-06, "loss": 0.4232, "step": 28270 }, { "epoch": 2.744032602367553, "grad_norm": 2.132418029742112, "learning_rate": 4.279966253488221e-06, "loss": 0.4826, "step": 28280 }, { "epoch": 2.7450029109256744, "grad_norm": 1.9769172617983983, "learning_rate": 4.263741968979168e-06, "loss": 0.4063, "step": 28290 }, { "epoch": 2.745973219483796, "grad_norm": 1.668270632688926, "learning_rate": 4.247517684470115e-06, "loss": 0.4467, "step": 28300 }, { "epoch": 2.746943528041917, "grad_norm": 2.4864744259346643, "learning_rate": 4.231293399961062e-06, "loss": 0.4827, "step": 28310 }, { "epoch": 2.7479138366000386, "grad_norm": 2.0299956671713084, "learning_rate": 4.215069115452009e-06, "loss": 0.4302, "step": 28320 }, { "epoch": 2.7488841451581605, "grad_norm": 1.7927586957376267, "learning_rate": 4.198844830942956e-06, "loss": 0.4597, "step": 28330 }, { "epoch": 2.749854453716282, "grad_norm": 1.8885257288009205, "learning_rate": 4.182620546433903e-06, "loss": 0.4456, "step": 28340 }, { "epoch": 2.7508247622744033, "grad_norm": 1.9141264783862757, "learning_rate": 4.166396261924849e-06, "loss": 0.4361, "step": 28350 }, { "epoch": 2.7517950708325247, "grad_norm": 2.11932477410823, "learning_rate": 4.150171977415796e-06, "loss": 0.4366, "step": 28360 }, { "epoch": 2.752765379390646, "grad_norm": 2.3047392463301106, "learning_rate": 4.133947692906743e-06, "loss": 0.4675, "step": 28370 }, { "epoch": 2.7537356879487676, "grad_norm": 2.3432165119649877, "learning_rate": 4.117723408397689e-06, "loss": 0.4664, "step": 28380 }, { "epoch": 2.754705996506889, "grad_norm": 2.431541461857591, "learning_rate": 4.101499123888637e-06, "loss": 0.4823, "step": 28390 }, { "epoch": 2.755676305065011, "grad_norm": 1.8914744547120323, "learning_rate": 4.085274839379583e-06, "loss": 0.5011, "step": 28400 }, { "epoch": 2.7566466136231322, "grad_norm": 1.7019623934614223, "learning_rate": 4.069050554870531e-06, "loss": 0.4382, "step": 28410 }, { "epoch": 2.7576169221812536, "grad_norm": 1.7049694786864458, "learning_rate": 4.0528262703614774e-06, "loss": 0.464, "step": 28420 }, { "epoch": 2.758587230739375, "grad_norm": 2.075329865566019, "learning_rate": 4.036601985852425e-06, "loss": 0.4447, "step": 28430 }, { "epoch": 2.7595575392974965, "grad_norm": 1.6212853711234059, "learning_rate": 4.020377701343371e-06, "loss": 0.4437, "step": 28440 }, { "epoch": 2.7605278478556183, "grad_norm": 2.307787314077055, "learning_rate": 4.004153416834317e-06, "loss": 0.4802, "step": 28450 }, { "epoch": 2.7614981564137393, "grad_norm": 2.4906214075497877, "learning_rate": 3.987929132325265e-06, "loss": 0.4128, "step": 28460 }, { "epoch": 2.762468464971861, "grad_norm": 1.9557039343662834, "learning_rate": 3.971704847816211e-06, "loss": 0.4797, "step": 28470 }, { "epoch": 2.7634387735299826, "grad_norm": 2.214689720042895, "learning_rate": 3.955480563307158e-06, "loss": 0.4438, "step": 28480 }, { "epoch": 2.764409082088104, "grad_norm": 1.8160233389388076, "learning_rate": 3.939256278798105e-06, "loss": 0.4577, "step": 28490 }, { "epoch": 2.7653793906462254, "grad_norm": 2.302171864396486, "learning_rate": 3.923031994289052e-06, "loss": 0.4358, "step": 28500 }, { "epoch": 2.7653793906462254, "eval_loss": 0.6244751811027527, "eval_runtime": 3077.993, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 28500 }, { "epoch": 2.766349699204347, "grad_norm": 2.272905462445502, "learning_rate": 3.906807709779999e-06, "loss": 0.4651, "step": 28510 }, { "epoch": 2.7673200077624687, "grad_norm": 2.3769636430382093, "learning_rate": 3.890583425270946e-06, "loss": 0.448, "step": 28520 }, { "epoch": 2.76829031632059, "grad_norm": 1.7989924386221376, "learning_rate": 3.874359140761893e-06, "loss": 0.4421, "step": 28530 }, { "epoch": 2.7692606248787115, "grad_norm": 2.0594639817252713, "learning_rate": 3.858134856252839e-06, "loss": 0.4129, "step": 28540 }, { "epoch": 2.770230933436833, "grad_norm": 2.275157308506755, "learning_rate": 3.841910571743786e-06, "loss": 0.4715, "step": 28550 }, { "epoch": 2.7712012419949543, "grad_norm": 2.0822414974647883, "learning_rate": 3.825686287234733e-06, "loss": 0.467, "step": 28560 }, { "epoch": 2.7721715505530757, "grad_norm": 2.4308808157297137, "learning_rate": 3.80946200272568e-06, "loss": 0.4533, "step": 28570 }, { "epoch": 2.773141859111197, "grad_norm": 1.963360367635364, "learning_rate": 3.7932377182166267e-06, "loss": 0.4522, "step": 28580 }, { "epoch": 2.774112167669319, "grad_norm": 2.245634314977675, "learning_rate": 3.7770134337075736e-06, "loss": 0.4545, "step": 28590 }, { "epoch": 2.7750824762274404, "grad_norm": 2.1669310451370825, "learning_rate": 3.7607891491985204e-06, "loss": 0.4254, "step": 28600 }, { "epoch": 2.776052784785562, "grad_norm": 2.1974484688266336, "learning_rate": 3.7445648646894676e-06, "loss": 0.4756, "step": 28610 }, { "epoch": 2.7770230933436832, "grad_norm": 1.9401881488996764, "learning_rate": 3.7283405801804144e-06, "loss": 0.4126, "step": 28620 }, { "epoch": 2.7779934019018047, "grad_norm": 1.7058980334254865, "learning_rate": 3.712116295671361e-06, "loss": 0.3833, "step": 28630 }, { "epoch": 2.7789637104599265, "grad_norm": 2.3370574860755466, "learning_rate": 3.695892011162308e-06, "loss": 0.4855, "step": 28640 }, { "epoch": 2.7799340190180475, "grad_norm": 2.1324366080197543, "learning_rate": 3.679667726653255e-06, "loss": 0.4615, "step": 28650 }, { "epoch": 2.7809043275761693, "grad_norm": 1.7988440382758561, "learning_rate": 3.6634434421442016e-06, "loss": 0.4178, "step": 28660 }, { "epoch": 2.7818746361342908, "grad_norm": 1.5545144152138595, "learning_rate": 3.6472191576351484e-06, "loss": 0.4525, "step": 28670 }, { "epoch": 2.782844944692412, "grad_norm": 2.1473173398790757, "learning_rate": 3.6309948731260952e-06, "loss": 0.4611, "step": 28680 }, { "epoch": 2.7838152532505336, "grad_norm": 2.8061447248530267, "learning_rate": 3.614770588617042e-06, "loss": 0.5126, "step": 28690 }, { "epoch": 2.784785561808655, "grad_norm": 1.9111124340369268, "learning_rate": 3.5985463041079893e-06, "loss": 0.4367, "step": 28700 }, { "epoch": 2.785755870366777, "grad_norm": 1.9392576645450146, "learning_rate": 3.582322019598936e-06, "loss": 0.4621, "step": 28710 }, { "epoch": 2.7867261789248983, "grad_norm": 1.8014139876919042, "learning_rate": 3.566097735089883e-06, "loss": 0.4636, "step": 28720 }, { "epoch": 2.7876964874830197, "grad_norm": 1.7521076570950156, "learning_rate": 3.5498734505808297e-06, "loss": 0.4833, "step": 28730 }, { "epoch": 2.788666796041141, "grad_norm": 2.128782120478336, "learning_rate": 3.5336491660717765e-06, "loss": 0.4403, "step": 28740 }, { "epoch": 2.7896371045992625, "grad_norm": 2.023872840505275, "learning_rate": 3.5174248815627233e-06, "loss": 0.4624, "step": 28750 }, { "epoch": 2.790607413157384, "grad_norm": 1.7812909010350144, "learning_rate": 3.50120059705367e-06, "loss": 0.4548, "step": 28760 }, { "epoch": 2.7915777217155053, "grad_norm": 2.361371274457747, "learning_rate": 3.484976312544617e-06, "loss": 0.4063, "step": 28770 }, { "epoch": 2.792548030273627, "grad_norm": 2.125467928143783, "learning_rate": 3.4687520280355637e-06, "loss": 0.4472, "step": 28780 }, { "epoch": 2.7935183388317486, "grad_norm": 2.2019772852671093, "learning_rate": 3.4525277435265105e-06, "loss": 0.416, "step": 28790 }, { "epoch": 2.79448864738987, "grad_norm": 1.899213031924614, "learning_rate": 3.4363034590174578e-06, "loss": 0.4404, "step": 28800 }, { "epoch": 2.7954589559479914, "grad_norm": 2.1095155555214866, "learning_rate": 3.4200791745084046e-06, "loss": 0.4216, "step": 28810 }, { "epoch": 2.796429264506113, "grad_norm": 2.094921408819589, "learning_rate": 3.4038548899993514e-06, "loss": 0.4583, "step": 28820 }, { "epoch": 2.7973995730642347, "grad_norm": 1.8731980506653032, "learning_rate": 3.387630605490298e-06, "loss": 0.4009, "step": 28830 }, { "epoch": 2.7983698816223557, "grad_norm": 2.061101723285212, "learning_rate": 3.371406320981245e-06, "loss": 0.4288, "step": 28840 }, { "epoch": 2.7993401901804775, "grad_norm": 2.315406035448941, "learning_rate": 3.3551820364721918e-06, "loss": 0.3685, "step": 28850 }, { "epoch": 2.800310498738599, "grad_norm": 2.4466340315196984, "learning_rate": 3.3389577519631386e-06, "loss": 0.4544, "step": 28860 }, { "epoch": 2.8012808072967204, "grad_norm": 2.5309005015772703, "learning_rate": 3.3227334674540854e-06, "loss": 0.3991, "step": 28870 }, { "epoch": 2.8022511158548418, "grad_norm": 2.33786885041076, "learning_rate": 3.306509182945032e-06, "loss": 0.4075, "step": 28880 }, { "epoch": 2.803221424412963, "grad_norm": 1.7070542723191706, "learning_rate": 3.290284898435979e-06, "loss": 0.4672, "step": 28890 }, { "epoch": 2.804191732971085, "grad_norm": 2.2766414030092563, "learning_rate": 3.2740606139269262e-06, "loss": 0.3913, "step": 28900 }, { "epoch": 2.8051620415292065, "grad_norm": 1.9290090208955126, "learning_rate": 3.257836329417873e-06, "loss": 0.518, "step": 28910 }, { "epoch": 2.806132350087328, "grad_norm": 2.5738723062700752, "learning_rate": 3.24161204490882e-06, "loss": 0.4894, "step": 28920 }, { "epoch": 2.8071026586454493, "grad_norm": 2.2041758930923128, "learning_rate": 3.2253877603997667e-06, "loss": 0.4158, "step": 28930 }, { "epoch": 2.8080729672035707, "grad_norm": 2.0477210954761764, "learning_rate": 3.2091634758907135e-06, "loss": 0.4124, "step": 28940 }, { "epoch": 2.809043275761692, "grad_norm": 2.2106706440190336, "learning_rate": 3.1929391913816603e-06, "loss": 0.3813, "step": 28950 }, { "epoch": 2.8100135843198135, "grad_norm": 2.1260352879701934, "learning_rate": 3.176714906872607e-06, "loss": 0.4287, "step": 28960 }, { "epoch": 2.8109838928779354, "grad_norm": 1.9992896056229097, "learning_rate": 3.160490622363554e-06, "loss": 0.4328, "step": 28970 }, { "epoch": 2.811954201436057, "grad_norm": 2.159152949731413, "learning_rate": 3.1442663378545007e-06, "loss": 0.4293, "step": 28980 }, { "epoch": 2.812924509994178, "grad_norm": 2.2374009999442785, "learning_rate": 3.1280420533454475e-06, "loss": 0.4617, "step": 28990 }, { "epoch": 2.8138948185522996, "grad_norm": 1.8202081649631938, "learning_rate": 3.1118177688363943e-06, "loss": 0.3875, "step": 29000 }, { "epoch": 2.8138948185522996, "eval_loss": 0.6232675313949585, "eval_runtime": 3073.6868, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.292, "step": 29000 }, { "epoch": 2.814865127110421, "grad_norm": 1.9975345056123566, "learning_rate": 3.095593484327341e-06, "loss": 0.5079, "step": 29010 }, { "epoch": 2.8158354356685424, "grad_norm": 2.074375302030034, "learning_rate": 3.0793691998182883e-06, "loss": 0.4577, "step": 29020 }, { "epoch": 2.816805744226664, "grad_norm": 2.218584417722533, "learning_rate": 3.063144915309235e-06, "loss": 0.4333, "step": 29030 }, { "epoch": 2.8177760527847857, "grad_norm": 2.2728312790217324, "learning_rate": 3.046920630800182e-06, "loss": 0.4042, "step": 29040 }, { "epoch": 2.818746361342907, "grad_norm": 2.421120322910574, "learning_rate": 3.0306963462911288e-06, "loss": 0.5176, "step": 29050 }, { "epoch": 2.8197166699010285, "grad_norm": 1.5899860240226635, "learning_rate": 3.0144720617820756e-06, "loss": 0.4099, "step": 29060 }, { "epoch": 2.82068697845915, "grad_norm": 2.286571047440197, "learning_rate": 2.9982477772730224e-06, "loss": 0.4452, "step": 29070 }, { "epoch": 2.8216572870172714, "grad_norm": 2.0098306219601394, "learning_rate": 2.982023492763969e-06, "loss": 0.4358, "step": 29080 }, { "epoch": 2.8226275955753932, "grad_norm": 1.9412581005671086, "learning_rate": 2.965799208254916e-06, "loss": 0.3856, "step": 29090 }, { "epoch": 2.823597904133514, "grad_norm": 1.8848622999343756, "learning_rate": 2.9495749237458628e-06, "loss": 0.4453, "step": 29100 }, { "epoch": 2.824568212691636, "grad_norm": 1.86451722270559, "learning_rate": 2.9333506392368096e-06, "loss": 0.4229, "step": 29110 }, { "epoch": 2.8255385212497575, "grad_norm": 1.7602556440674852, "learning_rate": 2.917126354727757e-06, "loss": 0.4979, "step": 29120 }, { "epoch": 2.826508829807879, "grad_norm": 1.5893550298578953, "learning_rate": 2.9009020702187036e-06, "loss": 0.4528, "step": 29130 }, { "epoch": 2.8274791383660003, "grad_norm": 1.2497217366577578, "learning_rate": 2.8846777857096504e-06, "loss": 0.4502, "step": 29140 }, { "epoch": 2.8284494469241217, "grad_norm": 2.633406180626893, "learning_rate": 2.8684535012005972e-06, "loss": 0.5158, "step": 29150 }, { "epoch": 2.8294197554822436, "grad_norm": 2.000419393386816, "learning_rate": 2.852229216691544e-06, "loss": 0.428, "step": 29160 }, { "epoch": 2.830390064040365, "grad_norm": 2.082613885146073, "learning_rate": 2.836004932182491e-06, "loss": 0.4341, "step": 29170 }, { "epoch": 2.8313603725984864, "grad_norm": 1.9785146782586598, "learning_rate": 2.8197806476734377e-06, "loss": 0.4588, "step": 29180 }, { "epoch": 2.832330681156608, "grad_norm": 1.7903733709971925, "learning_rate": 2.8035563631643845e-06, "loss": 0.4845, "step": 29190 }, { "epoch": 2.833300989714729, "grad_norm": 1.8648175341973643, "learning_rate": 2.7873320786553313e-06, "loss": 0.4345, "step": 29200 }, { "epoch": 2.8342712982728506, "grad_norm": 2.0160560055540158, "learning_rate": 2.771107794146278e-06, "loss": 0.4381, "step": 29210 }, { "epoch": 2.835241606830972, "grad_norm": 2.514031456381022, "learning_rate": 2.7548835096372253e-06, "loss": 0.4773, "step": 29220 }, { "epoch": 2.836211915389094, "grad_norm": 2.451150108341062, "learning_rate": 2.738659225128172e-06, "loss": 0.4306, "step": 29230 }, { "epoch": 2.8371822239472153, "grad_norm": 2.716089438342234, "learning_rate": 2.722434940619119e-06, "loss": 0.4789, "step": 29240 }, { "epoch": 2.8381525325053367, "grad_norm": 2.5884518633116627, "learning_rate": 2.7062106561100657e-06, "loss": 0.4558, "step": 29250 }, { "epoch": 2.839122841063458, "grad_norm": 2.3443286722163132, "learning_rate": 2.6899863716010125e-06, "loss": 0.4246, "step": 29260 }, { "epoch": 2.8400931496215795, "grad_norm": 1.5795028668268227, "learning_rate": 2.6737620870919593e-06, "loss": 0.4335, "step": 29270 }, { "epoch": 2.8410634581797014, "grad_norm": 1.9413322069291443, "learning_rate": 2.657537802582906e-06, "loss": 0.4777, "step": 29280 }, { "epoch": 2.8420337667378224, "grad_norm": 1.5366224490099267, "learning_rate": 2.641313518073853e-06, "loss": 0.4164, "step": 29290 }, { "epoch": 2.8430040752959442, "grad_norm": 2.285360529638414, "learning_rate": 2.6250892335647997e-06, "loss": 0.4562, "step": 29300 }, { "epoch": 2.8439743838540656, "grad_norm": 1.7050148355952945, "learning_rate": 2.608864949055747e-06, "loss": 0.4544, "step": 29310 }, { "epoch": 2.844944692412187, "grad_norm": 2.382556184587553, "learning_rate": 2.5926406645466938e-06, "loss": 0.4316, "step": 29320 }, { "epoch": 2.8459150009703085, "grad_norm": 1.765930000067448, "learning_rate": 2.5764163800376406e-06, "loss": 0.4458, "step": 29330 }, { "epoch": 2.84688530952843, "grad_norm": 1.802624588506912, "learning_rate": 2.5601920955285874e-06, "loss": 0.4756, "step": 29340 }, { "epoch": 2.8478556180865517, "grad_norm": 2.0089001537510467, "learning_rate": 2.543967811019534e-06, "loss": 0.4579, "step": 29350 }, { "epoch": 2.848825926644673, "grad_norm": 1.5966154551376606, "learning_rate": 2.527743526510481e-06, "loss": 0.474, "step": 29360 }, { "epoch": 2.8497962352027946, "grad_norm": 1.7558982614983885, "learning_rate": 2.511519242001428e-06, "loss": 0.4568, "step": 29370 }, { "epoch": 2.850766543760916, "grad_norm": 2.299799026322842, "learning_rate": 2.4952949574923746e-06, "loss": 0.4955, "step": 29380 }, { "epoch": 2.8517368523190374, "grad_norm": 2.1883154677675494, "learning_rate": 2.4790706729833214e-06, "loss": 0.4396, "step": 29390 }, { "epoch": 2.852707160877159, "grad_norm": 2.0178771801525768, "learning_rate": 2.4628463884742682e-06, "loss": 0.4411, "step": 29400 }, { "epoch": 2.8536774694352802, "grad_norm": 1.9703499246094394, "learning_rate": 2.4466221039652155e-06, "loss": 0.536, "step": 29410 }, { "epoch": 2.854647777993402, "grad_norm": 2.1998113464935813, "learning_rate": 2.4303978194561623e-06, "loss": 0.4706, "step": 29420 }, { "epoch": 2.8556180865515235, "grad_norm": 1.8485010374147874, "learning_rate": 2.414173534947109e-06, "loss": 0.4638, "step": 29430 }, { "epoch": 2.856588395109645, "grad_norm": 1.8515182421754246, "learning_rate": 2.397949250438056e-06, "loss": 0.5273, "step": 29440 }, { "epoch": 2.8575587036677663, "grad_norm": 2.631466092544644, "learning_rate": 2.3817249659290027e-06, "loss": 0.4275, "step": 29450 }, { "epoch": 2.8585290122258877, "grad_norm": 2.2252997053117864, "learning_rate": 2.3655006814199495e-06, "loss": 0.4004, "step": 29460 }, { "epoch": 2.8594993207840096, "grad_norm": 2.0807608855175688, "learning_rate": 2.3492763969108963e-06, "loss": 0.4171, "step": 29470 }, { "epoch": 2.8604696293421306, "grad_norm": 1.1761636089725185, "learning_rate": 2.333052112401843e-06, "loss": 0.4603, "step": 29480 }, { "epoch": 2.8614399379002524, "grad_norm": 1.6884064260726555, "learning_rate": 2.31682782789279e-06, "loss": 0.4188, "step": 29490 }, { "epoch": 2.862410246458374, "grad_norm": 2.3472824917752404, "learning_rate": 2.3006035433837367e-06, "loss": 0.4502, "step": 29500 }, { "epoch": 2.862410246458374, "eval_loss": 0.6238746643066406, "eval_runtime": 3075.7627, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 29500 }, { "epoch": 2.8633805550164952, "grad_norm": 2.5977750166664024, "learning_rate": 2.284379258874684e-06, "loss": 0.4367, "step": 29510 }, { "epoch": 2.8643508635746167, "grad_norm": 2.422920240878537, "learning_rate": 2.2681549743656308e-06, "loss": 0.4491, "step": 29520 }, { "epoch": 2.865321172132738, "grad_norm": 2.2619764113691483, "learning_rate": 2.2519306898565776e-06, "loss": 0.4044, "step": 29530 }, { "epoch": 2.86629148069086, "grad_norm": 2.190155928481078, "learning_rate": 2.2357064053475244e-06, "loss": 0.4607, "step": 29540 }, { "epoch": 2.8672617892489813, "grad_norm": 2.077728136866747, "learning_rate": 2.219482120838471e-06, "loss": 0.463, "step": 29550 }, { "epoch": 2.8682320978071028, "grad_norm": 2.5244135130594114, "learning_rate": 2.203257836329418e-06, "loss": 0.4293, "step": 29560 }, { "epoch": 2.869202406365224, "grad_norm": 1.768046014879214, "learning_rate": 2.1870335518203648e-06, "loss": 0.4452, "step": 29570 }, { "epoch": 2.8701727149233456, "grad_norm": 2.1675297770537014, "learning_rate": 2.1708092673113116e-06, "loss": 0.4522, "step": 29580 }, { "epoch": 2.871143023481467, "grad_norm": 1.9872340195882794, "learning_rate": 2.1545849828022584e-06, "loss": 0.4429, "step": 29590 }, { "epoch": 2.8721133320395884, "grad_norm": 2.0567869664748857, "learning_rate": 2.138360698293205e-06, "loss": 0.4285, "step": 29600 }, { "epoch": 2.8730836405977103, "grad_norm": 1.7606768297067341, "learning_rate": 2.1221364137841524e-06, "loss": 0.447, "step": 29610 }, { "epoch": 2.8740539491558317, "grad_norm": 1.703370684658763, "learning_rate": 2.1059121292750992e-06, "loss": 0.4513, "step": 29620 }, { "epoch": 2.875024257713953, "grad_norm": 1.8390884956958669, "learning_rate": 2.089687844766046e-06, "loss": 0.4326, "step": 29630 }, { "epoch": 2.8759945662720745, "grad_norm": 2.114080260103575, "learning_rate": 2.073463560256993e-06, "loss": 0.4338, "step": 29640 }, { "epoch": 2.876964874830196, "grad_norm": 1.8514108093288673, "learning_rate": 2.0572392757479397e-06, "loss": 0.4668, "step": 29650 }, { "epoch": 2.8779351833883173, "grad_norm": 2.2361386532279734, "learning_rate": 2.0410149912388865e-06, "loss": 0.4297, "step": 29660 }, { "epoch": 2.8789054919464387, "grad_norm": 2.131214768396392, "learning_rate": 2.0247907067298333e-06, "loss": 0.4328, "step": 29670 }, { "epoch": 2.8798758005045606, "grad_norm": 2.1019284181020064, "learning_rate": 2.00856642222078e-06, "loss": 0.4156, "step": 29680 }, { "epoch": 2.880846109062682, "grad_norm": 2.251705196561933, "learning_rate": 1.992342137711727e-06, "loss": 0.4374, "step": 29690 }, { "epoch": 2.8818164176208034, "grad_norm": 2.044305149114161, "learning_rate": 1.9761178532026737e-06, "loss": 0.4781, "step": 29700 }, { "epoch": 2.882786726178925, "grad_norm": 1.866013874663238, "learning_rate": 1.959893568693621e-06, "loss": 0.4165, "step": 29710 }, { "epoch": 2.8837570347370463, "grad_norm": 2.097405296562979, "learning_rate": 1.9436692841845677e-06, "loss": 0.4664, "step": 29720 }, { "epoch": 2.884727343295168, "grad_norm": 2.040045435620021, "learning_rate": 1.9274449996755145e-06, "loss": 0.4846, "step": 29730 }, { "epoch": 2.885697651853289, "grad_norm": 2.2295032295717654, "learning_rate": 1.9112207151664613e-06, "loss": 0.4923, "step": 29740 }, { "epoch": 2.886667960411411, "grad_norm": 2.1305999483641522, "learning_rate": 1.8949964306574084e-06, "loss": 0.4555, "step": 29750 }, { "epoch": 2.8876382689695324, "grad_norm": 2.2827545303462076, "learning_rate": 1.878772146148355e-06, "loss": 0.4335, "step": 29760 }, { "epoch": 2.8886085775276538, "grad_norm": 2.5604788482995966, "learning_rate": 1.8625478616393018e-06, "loss": 0.4496, "step": 29770 }, { "epoch": 2.889578886085775, "grad_norm": 2.3352852370426915, "learning_rate": 1.8463235771302486e-06, "loss": 0.4329, "step": 29780 }, { "epoch": 2.8905491946438966, "grad_norm": 1.792672664561321, "learning_rate": 1.8300992926211954e-06, "loss": 0.4358, "step": 29790 }, { "epoch": 2.8915195032020184, "grad_norm": 1.7906490925426128, "learning_rate": 1.8138750081121426e-06, "loss": 0.4431, "step": 29800 }, { "epoch": 2.89248981176014, "grad_norm": 2.035855534112005, "learning_rate": 1.7976507236030892e-06, "loss": 0.4374, "step": 29810 }, { "epoch": 2.8934601203182613, "grad_norm": 2.1051348631367857, "learning_rate": 1.781426439094036e-06, "loss": 0.5129, "step": 29820 }, { "epoch": 2.8944304288763827, "grad_norm": 2.444624147867575, "learning_rate": 1.7652021545849828e-06, "loss": 0.4146, "step": 29830 }, { "epoch": 2.895400737434504, "grad_norm": 1.5061400844207797, "learning_rate": 1.7489778700759296e-06, "loss": 0.4161, "step": 29840 }, { "epoch": 2.8963710459926255, "grad_norm": 1.9335393298237211, "learning_rate": 1.7327535855668768e-06, "loss": 0.4098, "step": 29850 }, { "epoch": 2.897341354550747, "grad_norm": 1.6109688598334573, "learning_rate": 1.7165293010578234e-06, "loss": 0.3858, "step": 29860 }, { "epoch": 2.898311663108869, "grad_norm": 2.5000803342722806, "learning_rate": 1.7003050165487702e-06, "loss": 0.4644, "step": 29870 }, { "epoch": 2.89928197166699, "grad_norm": 2.0995377332675136, "learning_rate": 1.684080732039717e-06, "loss": 0.4258, "step": 29880 }, { "epoch": 2.9002522802251116, "grad_norm": 2.4217465956185986, "learning_rate": 1.6678564475306638e-06, "loss": 0.4741, "step": 29890 }, { "epoch": 2.901222588783233, "grad_norm": 2.22229495904859, "learning_rate": 1.651632163021611e-06, "loss": 0.4563, "step": 29900 }, { "epoch": 2.9021928973413544, "grad_norm": 2.2752470756307326, "learning_rate": 1.6354078785125577e-06, "loss": 0.4213, "step": 29910 }, { "epoch": 2.9031632058994763, "grad_norm": 2.2013289800729043, "learning_rate": 1.6191835940035045e-06, "loss": 0.4091, "step": 29920 }, { "epoch": 2.9041335144575973, "grad_norm": 2.1229276793810605, "learning_rate": 1.6029593094944513e-06, "loss": 0.4522, "step": 29930 }, { "epoch": 2.905103823015719, "grad_norm": 2.1647066898307177, "learning_rate": 1.586735024985398e-06, "loss": 0.4248, "step": 29940 }, { "epoch": 2.9060741315738405, "grad_norm": 2.132367635560717, "learning_rate": 1.5705107404763451e-06, "loss": 0.44, "step": 29950 }, { "epoch": 2.907044440131962, "grad_norm": 2.273307314582221, "learning_rate": 1.554286455967292e-06, "loss": 0.4419, "step": 29960 }, { "epoch": 2.9080147486900834, "grad_norm": 2.1262678420504826, "learning_rate": 1.5380621714582387e-06, "loss": 0.4484, "step": 29970 }, { "epoch": 2.9089850572482048, "grad_norm": 2.2020603733056316, "learning_rate": 1.5218378869491855e-06, "loss": 0.4192, "step": 29980 }, { "epoch": 2.9099553658063266, "grad_norm": 2.3228052546578613, "learning_rate": 1.5056136024401325e-06, "loss": 0.544, "step": 29990 }, { "epoch": 2.910925674364448, "grad_norm": 1.7647497970365447, "learning_rate": 1.4893893179310793e-06, "loss": 0.4386, "step": 30000 }, { "epoch": 2.910925674364448, "eval_loss": 0.623051106929779, "eval_runtime": 3072.7615, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.292, "step": 30000 }, { "epoch": 2.9118959829225695, "grad_norm": 1.8559181997110636, "learning_rate": 1.4731650334220262e-06, "loss": 0.4467, "step": 30010 }, { "epoch": 2.912866291480691, "grad_norm": 1.765879343610588, "learning_rate": 1.456940748912973e-06, "loss": 0.4465, "step": 30020 }, { "epoch": 2.9138366000388123, "grad_norm": 2.1284440857044107, "learning_rate": 1.4407164644039198e-06, "loss": 0.4608, "step": 30030 }, { "epoch": 2.9148069085969337, "grad_norm": 2.1253342038160343, "learning_rate": 1.4244921798948668e-06, "loss": 0.4702, "step": 30040 }, { "epoch": 2.915777217155055, "grad_norm": 2.8231500828592853, "learning_rate": 1.4082678953858136e-06, "loss": 0.4152, "step": 30050 }, { "epoch": 2.916747525713177, "grad_norm": 2.14762659426082, "learning_rate": 1.3920436108767604e-06, "loss": 0.4488, "step": 30060 }, { "epoch": 2.9177178342712984, "grad_norm": 2.238422809039932, "learning_rate": 1.3758193263677072e-06, "loss": 0.4304, "step": 30070 }, { "epoch": 2.91868814282942, "grad_norm": 2.040414777705735, "learning_rate": 1.3595950418586542e-06, "loss": 0.4091, "step": 30080 }, { "epoch": 2.919658451387541, "grad_norm": 2.3708613122486444, "learning_rate": 1.343370757349601e-06, "loss": 0.4167, "step": 30090 }, { "epoch": 2.9206287599456626, "grad_norm": 2.0299308055774827, "learning_rate": 1.3271464728405478e-06, "loss": 0.42, "step": 30100 }, { "epoch": 2.9215990685037845, "grad_norm": 1.5819249876583636, "learning_rate": 1.3109221883314946e-06, "loss": 0.473, "step": 30110 }, { "epoch": 2.9225693770619054, "grad_norm": 2.7499163471346075, "learning_rate": 1.2946979038224414e-06, "loss": 0.4427, "step": 30120 }, { "epoch": 2.9235396856200273, "grad_norm": 2.2578547503079327, "learning_rate": 1.2784736193133885e-06, "loss": 0.4121, "step": 30130 }, { "epoch": 2.9245099941781487, "grad_norm": 1.9631614897936933, "learning_rate": 1.2622493348043353e-06, "loss": 0.4396, "step": 30140 }, { "epoch": 2.92548030273627, "grad_norm": 1.404543931370615, "learning_rate": 1.246025050295282e-06, "loss": 0.4126, "step": 30150 }, { "epoch": 2.9264506112943915, "grad_norm": 1.6868486600814803, "learning_rate": 1.2298007657862289e-06, "loss": 0.4603, "step": 30160 }, { "epoch": 2.927420919852513, "grad_norm": 2.171654518981109, "learning_rate": 1.2135764812771757e-06, "loss": 0.4154, "step": 30170 }, { "epoch": 2.928391228410635, "grad_norm": 2.3436573795705886, "learning_rate": 1.1973521967681227e-06, "loss": 0.4368, "step": 30180 }, { "epoch": 2.9293615369687562, "grad_norm": 1.4978662022608422, "learning_rate": 1.1811279122590695e-06, "loss": 0.3973, "step": 30190 }, { "epoch": 2.9303318455268776, "grad_norm": 2.0381618315849326, "learning_rate": 1.1649036277500161e-06, "loss": 0.4272, "step": 30200 }, { "epoch": 2.931302154084999, "grad_norm": 2.3740316630026146, "learning_rate": 1.1486793432409631e-06, "loss": 0.4545, "step": 30210 }, { "epoch": 2.9322724626431205, "grad_norm": 2.099126790405094, "learning_rate": 1.13245505873191e-06, "loss": 0.4975, "step": 30220 }, { "epoch": 2.933242771201242, "grad_norm": 2.0798837939252666, "learning_rate": 1.116230774222857e-06, "loss": 0.4184, "step": 30230 }, { "epoch": 2.9342130797593633, "grad_norm": 2.3295006076855254, "learning_rate": 1.1000064897138038e-06, "loss": 0.4169, "step": 30240 }, { "epoch": 2.935183388317485, "grad_norm": 2.3900411114763487, "learning_rate": 1.0837822052047503e-06, "loss": 0.4675, "step": 30250 }, { "epoch": 2.9361536968756066, "grad_norm": 2.717638141511491, "learning_rate": 1.0675579206956974e-06, "loss": 0.4423, "step": 30260 }, { "epoch": 2.937124005433728, "grad_norm": 2.6356226803205285, "learning_rate": 1.0513336361866442e-06, "loss": 0.4459, "step": 30270 }, { "epoch": 2.9380943139918494, "grad_norm": 2.3170591461997043, "learning_rate": 1.0351093516775912e-06, "loss": 0.4602, "step": 30280 }, { "epoch": 2.939064622549971, "grad_norm": 2.103941768919782, "learning_rate": 1.018885067168538e-06, "loss": 0.437, "step": 30290 }, { "epoch": 2.940034931108092, "grad_norm": 2.080140592158321, "learning_rate": 1.0026607826594848e-06, "loss": 0.4231, "step": 30300 }, { "epoch": 2.9410052396662136, "grad_norm": 1.8534014560410284, "learning_rate": 9.864364981504316e-07, "loss": 0.4346, "step": 30310 }, { "epoch": 2.9419755482243355, "grad_norm": 1.8144032241447858, "learning_rate": 9.702122136413784e-07, "loss": 0.4343, "step": 30320 }, { "epoch": 2.942945856782457, "grad_norm": 1.9643180746419227, "learning_rate": 9.539879291323254e-07, "loss": 0.4149, "step": 30330 }, { "epoch": 2.9439161653405783, "grad_norm": 2.1260270606096303, "learning_rate": 9.377636446232721e-07, "loss": 0.4097, "step": 30340 }, { "epoch": 2.9448864738986997, "grad_norm": 2.3993733475340906, "learning_rate": 9.21539360114219e-07, "loss": 0.4828, "step": 30350 }, { "epoch": 2.945856782456821, "grad_norm": 1.778314955145708, "learning_rate": 9.053150756051659e-07, "loss": 0.4612, "step": 30360 }, { "epoch": 2.946827091014943, "grad_norm": 2.074977339721577, "learning_rate": 8.890907910961127e-07, "loss": 0.4353, "step": 30370 }, { "epoch": 2.947797399573064, "grad_norm": 2.056320572981397, "learning_rate": 8.728665065870596e-07, "loss": 0.4136, "step": 30380 }, { "epoch": 2.948767708131186, "grad_norm": 2.4400683398792813, "learning_rate": 8.566422220780064e-07, "loss": 0.4765, "step": 30390 }, { "epoch": 2.9497380166893072, "grad_norm": 1.5761154835823354, "learning_rate": 8.404179375689533e-07, "loss": 0.4672, "step": 30400 }, { "epoch": 2.9507083252474287, "grad_norm": 2.1190514650227215, "learning_rate": 8.241936530599001e-07, "loss": 0.4023, "step": 30410 }, { "epoch": 2.95167863380555, "grad_norm": 2.2002378387766335, "learning_rate": 8.079693685508469e-07, "loss": 0.4055, "step": 30420 }, { "epoch": 2.9526489423636715, "grad_norm": 1.8539838422639077, "learning_rate": 7.917450840417938e-07, "loss": 0.4376, "step": 30430 }, { "epoch": 2.9536192509217933, "grad_norm": 2.5797574718759755, "learning_rate": 7.755207995327406e-07, "loss": 0.497, "step": 30440 }, { "epoch": 2.9545895594799148, "grad_norm": 1.8550452979400938, "learning_rate": 7.592965150236874e-07, "loss": 0.4248, "step": 30450 }, { "epoch": 2.955559868038036, "grad_norm": 2.321082121232701, "learning_rate": 7.430722305146343e-07, "loss": 0.4623, "step": 30460 }, { "epoch": 2.9565301765961576, "grad_norm": 2.262813558857042, "learning_rate": 7.268479460055812e-07, "loss": 0.5066, "step": 30470 }, { "epoch": 2.957500485154279, "grad_norm": 2.399426904756141, "learning_rate": 7.10623661496528e-07, "loss": 0.4812, "step": 30480 }, { "epoch": 2.9584707937124004, "grad_norm": 2.3405993527889546, "learning_rate": 6.94399376987475e-07, "loss": 0.4608, "step": 30490 }, { "epoch": 2.959441102270522, "grad_norm": 1.7961186657455892, "learning_rate": 6.781750924784217e-07, "loss": 0.4229, "step": 30500 }, { "epoch": 2.959441102270522, "eval_loss": 0.6230536699295044, "eval_runtime": 3074.076, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 30500 } ], "logging_steps": 10, "max_steps": 30918, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2264063227133952e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }