diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,3181 +10,3181 @@ "log_history": [ { "epoch": 0.04673552367154274, - "grad_norm": 0.8248543739318848, + "grad_norm": 0.8304148316383362, "learning_rate": 9.375e-06, - "loss": 6.1849, + "loss": 6.185, "step": 1000 }, { "epoch": 0.09347104734308548, - "grad_norm": 1.0290669202804565, + "grad_norm": 1.0305315256118774, "learning_rate": 1.875e-05, - "loss": 4.4262, + "loss": 4.4265, "step": 2000 }, { "epoch": 0.14020657101462822, - "grad_norm": 0.9003722667694092, + "grad_norm": 0.9049100875854492, "learning_rate": 2.8125e-05, - "loss": 4.0814, + "loss": 4.0813, "step": 3000 }, { "epoch": 0.18694209468617096, - "grad_norm": 1.0617480278015137, + "grad_norm": 1.0873961448669434, "learning_rate": 3.75e-05, - "loss": 3.8945, + "loss": 3.894, "step": 4000 }, { "epoch": 0.2336776183577137, - "grad_norm": 0.9939677715301514, + "grad_norm": 1.0131192207336426, "learning_rate": 4.6874999999999994e-05, - "loss": 3.7156, + "loss": 3.7155, "step": 5000 }, { "epoch": 0.28041314202925643, - "grad_norm": 0.9459012746810913, + "grad_norm": 0.9672640562057495, "learning_rate": 5.625e-05, - "loss": 3.5923, + "loss": 3.5928, "step": 6000 }, { "epoch": 0.3271486657007992, - "grad_norm": 1.051654577255249, + "grad_norm": 1.0065984725952148, "learning_rate": 6.5625e-05, - "loss": 3.4775, + "loss": 3.4789, "step": 7000 }, { "epoch": 0.3738841893723419, - "grad_norm": 0.8787893652915955, + "grad_norm": 0.886219322681427, "learning_rate": 7.5e-05, - "loss": 3.3928, + "loss": 3.3945, "step": 8000 }, { "epoch": 0.4206197130438847, - "grad_norm": 0.8886988162994385, + "grad_norm": 0.8790355920791626, "learning_rate": 8.437499999999999e-05, - "loss": 3.3159, + "loss": 3.318, "step": 9000 }, { "epoch": 0.4673552367154274, - "grad_norm": 0.8495904803276062, + "grad_norm": 0.8291410207748413, "learning_rate": 9.374999999999999e-05, - "loss": 3.2375, + "loss": 3.2399, "step": 10000 }, { "epoch": 0.5140907603869701, - "grad_norm": 0.8732250928878784, + "grad_norm": 0.8656781911849976, "learning_rate": 0.00010312499999999999, - "loss": 3.1634, + "loss": 3.1663, "step": 11000 }, { "epoch": 0.5608262840585129, - "grad_norm": 0.8345146775245667, - "learning_rate": 0.00011248124999999999, - "loss": 3.0978, + "grad_norm": 0.8185460567474365, + "learning_rate": 0.0001125, + "loss": 3.1005, "step": 12000 }, { "epoch": 0.6075618077300556, - "grad_norm": 0.8283113241195679, + "grad_norm": 0.8152302503585815, "learning_rate": 0.00012185624999999998, - "loss": 3.0404, + "loss": 3.042, "step": 13000 }, { "epoch": 0.6542973314015984, - "grad_norm": 0.8576833605766296, - "learning_rate": 0.000131221875, - "loss": 2.994, + "grad_norm": 0.9006622433662415, + "learning_rate": 0.00013123125, + "loss": 2.9953, "step": 14000 }, { "epoch": 0.7010328550731411, - "grad_norm": 0.8024506568908691, + "grad_norm": 0.7897035479545593, "learning_rate": 0.000140596875, - "loss": 2.9562, + "loss": 2.9559, "step": 15000 }, { "epoch": 0.7477683787446838, - "grad_norm": 0.7570227384567261, - "learning_rate": 0.0001499625, - "loss": 2.9387, + "grad_norm": 0.7519062161445618, + "learning_rate": 0.000149971875, + "loss": 2.9381, "step": 16000 }, { "epoch": 0.7945039024162266, - "grad_norm": 0.7729004621505737, + "grad_norm": 0.7996582984924316, "learning_rate": 0.00015933749999999996, - "loss": 2.9017, + "loss": 2.9005, "step": 17000 }, { "epoch": 0.8412394260877694, - "grad_norm": 0.7341739535331726, - "learning_rate": 0.00016871249999999996, - "loss": 2.8696, + "grad_norm": 0.7610309720039368, + "learning_rate": 0.000168703125, + "loss": 2.8685, "step": 18000 }, { "epoch": 0.887974949759312, - "grad_norm": 0.6923528909683228, + "grad_norm": 0.6908340454101562, "learning_rate": 0.000178078125, - "loss": 2.8418, + "loss": 2.8399, "step": 19000 }, { "epoch": 0.9347104734308548, - "grad_norm": 0.6515923738479614, - "learning_rate": 0.00018743437499999996, - "loss": 2.8301, + "grad_norm": 0.653693437576294, + "learning_rate": 0.00018744374999999999, + "loss": 2.8288, "step": 20000 }, { "epoch": 0.9814459971023975, - "grad_norm": 0.6980977654457092, - "learning_rate": 0.00019680937499999996, - "loss": 2.8033, + "grad_norm": 0.6934326887130737, + "learning_rate": 0.00019681874999999998, + "loss": 2.8017, "step": 21000 }, { "epoch": 1.0, - "eval_accuracy": 0.4671583335075943, - "eval_loss": 2.906492233276367, - "eval_runtime": 181.9195, - "eval_samples_per_second": 386.572, - "eval_steps_per_second": 6.041, + "eval_accuracy": 0.4674409429609028, + "eval_loss": 2.9042868614196777, + "eval_runtime": 139.2393, + "eval_samples_per_second": 505.066, + "eval_steps_per_second": 7.893, "step": 21397 }, { "epoch": 1.0281815207739402, - "grad_norm": 0.6191843748092651, - "learning_rate": 0.00020618437499999995, - "loss": 2.7769, + "grad_norm": 0.625281810760498, + "learning_rate": 0.00020619374999999998, + "loss": 2.7759, "step": 22000 }, { "epoch": 1.074917044445483, - "grad_norm": 0.6327579021453857, + "grad_norm": 0.6107564568519592, "learning_rate": 0.00021555937499999998, - "loss": 2.7631, + "loss": 2.7621, "step": 23000 }, { "epoch": 1.1216525681170257, - "grad_norm": 0.5806933641433716, + "grad_norm": 0.5980122089385986, "learning_rate": 0.00022493437499999998, - "loss": 2.7601, + "loss": 2.7595, "step": 24000 }, { "epoch": 1.1683880917885685, - "grad_norm": 0.6323985457420349, + "grad_norm": 0.6484950184822083, "learning_rate": 0.00023429999999999998, - "loss": 2.7296, + "loss": 2.7285, "step": 25000 }, { "epoch": 1.2151236154601113, - "grad_norm": 0.5768846869468689, + "grad_norm": 0.5739120244979858, "learning_rate": 0.00024367499999999997, - "loss": 2.7258, + "loss": 2.7249, "step": 26000 }, { "epoch": 1.261859139131654, - "grad_norm": 0.5394349694252014, + "grad_norm": 0.5384770035743713, "learning_rate": 0.00025305, - "loss": 2.7186, + "loss": 2.7174, "step": 27000 }, { "epoch": 1.3085946628031966, - "grad_norm": 0.5966582298278809, + "grad_norm": 0.6061840653419495, "learning_rate": 0.000262415625, - "loss": 2.7106, + "loss": 2.7096, "step": 28000 }, { "epoch": 1.3553301864747396, - "grad_norm": 0.5289971232414246, + "grad_norm": 0.5460996627807617, "learning_rate": 0.000271790625, - "loss": 2.7075, + "loss": 2.7064, "step": 29000 }, { "epoch": 1.4020657101462821, - "grad_norm": 0.5028895139694214, + "grad_norm": 0.48677995800971985, "learning_rate": 0.00028115624999999994, - "loss": 2.6847, + "loss": 2.6832, "step": 30000 }, { "epoch": 1.4488012338178249, - "grad_norm": 0.46625378727912903, - "learning_rate": 0.00029053124999999994, - "loss": 2.6701, + "grad_norm": 0.4752153754234314, + "learning_rate": 0.000290521875, + "loss": 2.6692, "step": 31000 }, { "epoch": 1.4955367574893677, - "grad_norm": 0.47429364919662476, - "learning_rate": 0.00029990624999999993, - "loss": 2.6642, + "grad_norm": 0.4777120053768158, + "learning_rate": 0.000299896875, + "loss": 2.6635, "step": 32000 }, { "epoch": 1.5422722811609104, - "grad_norm": 0.4630049765110016, - "learning_rate": 0.0002992498863464161, - "loss": 2.6493, + "grad_norm": 0.4603022336959839, + "learning_rate": 0.00029925064403697526, + "loss": 2.6485, "step": 33000 }, { "epoch": 1.5890078048324532, - "grad_norm": 0.4727250933647156, + "grad_norm": 0.4725264608860016, "learning_rate": 0.0002984929534777996, - "loss": 2.6392, + "loss": 2.6377, "step": 34000 }, { "epoch": 1.6357433285039957, - "grad_norm": 0.42802268266677856, - "learning_rate": 0.000297735262918624, - "loss": 2.6413, + "grad_norm": 0.4137626588344574, + "learning_rate": 0.0002977360206091832, + "loss": 2.6406, "step": 35000 }, { "epoch": 1.6824788521755387, - "grad_norm": 0.46135851740837097, + "grad_norm": 0.46301260590553284, "learning_rate": 0.00029697833005000755, - "loss": 2.6263, + "loss": 2.6253, "step": 36000 }, { "epoch": 1.7292143758470813, - "grad_norm": 0.41609084606170654, - "learning_rate": 0.0002962206394908319, - "loss": 2.6188, + "grad_norm": 0.42351824045181274, + "learning_rate": 0.0002962213971813911, + "loss": 2.6182, "step": 37000 }, { "epoch": 1.7759498995186243, - "grad_norm": 0.4131929278373718, - "learning_rate": 0.00029546294893165626, - "loss": 2.6028, + "grad_norm": 0.407830148935318, + "learning_rate": 0.0002954637066222155, + "loss": 2.6019, "step": 38000 }, { "epoch": 1.8226854231901668, - "grad_norm": 0.42282310128211975, + "grad_norm": 0.4228152334690094, "learning_rate": 0.00029470601606303983, - "loss": 2.5922, + "loss": 2.5915, "step": 39000 }, { "epoch": 1.8694209468617096, - "grad_norm": 0.37517085671424866, - "learning_rate": 0.0002939483255038642, - "loss": 2.5966, + "grad_norm": 0.3772471845149994, + "learning_rate": 0.0002939490831944234, + "loss": 2.5955, "step": 40000 }, { "epoch": 1.9161564705332523, - "grad_norm": 0.3877894878387451, + "grad_norm": 0.3977556526660919, "learning_rate": 0.00029319139263524777, - "loss": 2.5769, + "loss": 2.5762, "step": 41000 }, { "epoch": 1.962891994204795, - "grad_norm": 0.3990887999534607, - "learning_rate": 0.0002924344597666313, - "loss": 2.57, + "grad_norm": 0.3756692707538605, + "learning_rate": 0.0002924337020760721, + "loss": 2.5697, "step": 42000 }, { "epoch": 2.0, - "eval_accuracy": 0.4900495319350634, - "eval_loss": 2.6932899951934814, - "eval_runtime": 184.3808, - "eval_samples_per_second": 381.412, - "eval_steps_per_second": 5.96, + "eval_accuracy": 0.49032617469312645, + "eval_loss": 2.691434383392334, + "eval_runtime": 139.6669, + "eval_samples_per_second": 503.519, + "eval_steps_per_second": 7.869, "step": 42794 }, { "epoch": 2.009627517876338, - "grad_norm": 0.3544236421585083, + "grad_norm": 0.35327184200286865, "learning_rate": 0.00029167676920745564, - "loss": 2.5606, + "loss": 2.5599, "step": 43000 }, { "epoch": 2.0563630415478804, - "grad_norm": 0.39064693450927734, + "grad_norm": 0.394180566072464, "learning_rate": 0.00029091907864828, - "loss": 2.5214, + "loss": 2.5206, "step": 44000 }, { "epoch": 2.1030985652194234, - "grad_norm": 0.39446505904197693, - "learning_rate": 0.0002901613880891044, - "loss": 2.5125, + "grad_norm": 0.39306408166885376, + "learning_rate": 0.0002901621457796636, + "loss": 2.5118, "step": 45000 }, { "epoch": 2.149834088890966, - "grad_norm": 0.4003261923789978, - "learning_rate": 0.00028940369752992876, - "loss": 2.5217, + "grad_norm": 0.3899837136268616, + "learning_rate": 0.00028940445522048793, + "loss": 2.5213, "step": 46000 }, { "epoch": 2.196569612562509, - "grad_norm": 0.38740643858909607, - "learning_rate": 0.0002886467646613123, - "loss": 2.5115, + "grad_norm": 0.38731372356414795, + "learning_rate": 0.00028864752235187145, + "loss": 2.511, "step": 47000 }, { "epoch": 2.2433051362340515, - "grad_norm": 0.36951524019241333, - "learning_rate": 0.0002878898317926958, - "loss": 2.5228, + "grad_norm": 0.3677964210510254, + "learning_rate": 0.000287890589483255, + "loss": 2.522, "step": 48000 }, { "epoch": 2.2900406599055945, - "grad_norm": 0.3615242540836334, - "learning_rate": 0.0002871321412335202, - "loss": 2.5062, + "grad_norm": 0.36402180790901184, + "learning_rate": 0.0002871328989240794, + "loss": 2.5053, "step": 49000 }, { "epoch": 2.336776183577137, - "grad_norm": 0.3720042407512665, - "learning_rate": 0.00028637445067434457, - "loss": 2.4979, + "grad_norm": 0.37511104345321655, + "learning_rate": 0.00028637520836490374, + "loss": 2.4976, "step": 50000 }, { "epoch": 2.3835117072486796, - "grad_norm": 0.3349687457084656, + "grad_norm": 0.34363070130348206, "learning_rate": 0.0002856175178057281, - "loss": 2.5054, + "loss": 2.5044, "step": 51000 }, { "epoch": 2.4302472309202225, - "grad_norm": 0.354491263628006, - "learning_rate": 0.0002848598272465525, - "loss": 2.4955, + "grad_norm": 0.3448020815849304, + "learning_rate": 0.00028486058493711167, + "loss": 2.4946, "step": 52000 }, { "epoch": 2.476982754591765, - "grad_norm": 0.36612212657928467, - "learning_rate": 0.00028410213668737686, - "loss": 2.4862, + "grad_norm": 0.35907140374183655, + "learning_rate": 0.000284102894377936, + "loss": 2.4859, "step": 53000 }, { "epoch": 2.523718278263308, - "grad_norm": 0.364339679479599, - "learning_rate": 0.0002833444461282012, - "loss": 2.4889, + "grad_norm": 0.3643428385257721, + "learning_rate": 0.0002833452038187604, + "loss": 2.4881, "step": 54000 }, { "epoch": 2.5704538019348506, - "grad_norm": 0.34728386998176575, - "learning_rate": 0.0002825875132595848, - "loss": 2.4912, + "grad_norm": 0.36020660400390625, + "learning_rate": 0.00028258827095014395, + "loss": 2.491, "step": 55000 }, { "epoch": 2.617189325606393, - "grad_norm": 0.3255521059036255, - "learning_rate": 0.00028182982270040914, - "loss": 2.48, + "grad_norm": 0.3306775987148285, + "learning_rate": 0.0002818305803909683, + "loss": 2.4793, "step": 56000 }, { "epoch": 2.663924849277936, - "grad_norm": 0.36621731519699097, - "learning_rate": 0.0002810721321412335, - "loss": 2.4846, + "grad_norm": 0.36928820610046387, + "learning_rate": 0.00028107288983179266, + "loss": 2.4838, "step": 57000 }, { "epoch": 2.710660372949479, - "grad_norm": 0.37520626187324524, + "grad_norm": 0.3665056526660919, "learning_rate": 0.00028031519927261707, - "loss": 2.4712, + "loss": 2.47, "step": 58000 }, { "epoch": 2.7573958966210217, - "grad_norm": 0.3485676348209381, + "grad_norm": 0.3365645408630371, "learning_rate": 0.0002795582664040006, - "loss": 2.477, + "loss": 2.4762, "step": 59000 }, { "epoch": 2.8041314202925642, - "grad_norm": 0.33132848143577576, + "grad_norm": 0.3240434229373932, "learning_rate": 0.00027880057584482495, - "loss": 2.479, + "loss": 2.4783, "step": 60000 }, { "epoch": 2.8508669439641072, - "grad_norm": 0.3298667371273041, - "learning_rate": 0.0002780428852856493, - "loss": 2.4676, + "grad_norm": 0.3358655571937561, + "learning_rate": 0.00027804364297620847, + "loss": 2.4671, "step": 61000 }, { "epoch": 2.8976024676356498, - "grad_norm": 0.3598766624927521, - "learning_rate": 0.0002772851947264737, - "loss": 2.4698, + "grad_norm": 0.3612423837184906, + "learning_rate": 0.0002772859524170328, + "loss": 2.4694, "step": 62000 }, { "epoch": 2.9443379913071928, - "grad_norm": 0.34787800908088684, + "grad_norm": 0.3500802516937256, "learning_rate": 0.00027652826185785724, - "loss": 2.451, + "loss": 2.4508, "step": 63000 }, { "epoch": 2.9910735149787353, - "grad_norm": 0.3385255038738251, - "learning_rate": 0.0002757705712986816, - "loss": 2.4599, + "grad_norm": 0.3532866835594177, + "learning_rate": 0.0002757720866797999, + "loss": 2.4593, "step": 64000 }, { "epoch": 3.0, - "eval_accuracy": 0.5007376675519123, - "eval_loss": 2.6011228561401367, - "eval_runtime": 183.7613, - "eval_samples_per_second": 382.697, - "eval_steps_per_second": 5.981, + "eval_accuracy": 0.5008982106884702, + "eval_loss": 2.599777936935425, + "eval_runtime": 139.9382, + "eval_samples_per_second": 502.543, + "eval_steps_per_second": 7.853, "step": 64191 }, { "epoch": 3.0378090386502783, - "grad_norm": 0.33664873242378235, - "learning_rate": 0.0002750136384300651, - "loss": 2.4104, + "grad_norm": 0.3479374945163727, + "learning_rate": 0.00027501439612062433, + "loss": 2.4102, "step": 65000 }, { "epoch": 3.084544562321821, - "grad_norm": 0.3183669149875641, - "learning_rate": 0.00027425594787088947, - "loss": 2.4087, + "grad_norm": 0.3217349052429199, + "learning_rate": 0.00027425746325200785, + "loss": 2.4086, "step": 66000 }, { "epoch": 3.1312800859933634, - "grad_norm": 0.3450115919113159, - "learning_rate": 0.00027349901500227304, - "loss": 2.4053, + "grad_norm": 0.33104005455970764, + "learning_rate": 0.0002734997726928322, + "loss": 2.4046, "step": 67000 }, { "epoch": 3.1780156096649064, - "grad_norm": 0.3532576560974121, - "learning_rate": 0.0002727413244430974, - "loss": 2.405, + "grad_norm": 0.3521591126918793, + "learning_rate": 0.0002727420821336566, + "loss": 2.4047, "step": 68000 }, { "epoch": 3.224751133336449, - "grad_norm": 0.356446772813797, - "learning_rate": 0.00027198363388392175, - "loss": 2.4053, + "grad_norm": 0.3443906009197235, + "learning_rate": 0.000271984391574481, + "loss": 2.4051, "step": 69000 }, { "epoch": 3.271486657007992, - "grad_norm": 0.30622342228889465, + "grad_norm": 0.30954253673553467, "learning_rate": 0.00027122670101530533, - "loss": 2.4025, + "loss": 2.4017, "step": 70000 }, { "epoch": 3.3182221806795344, - "grad_norm": 0.3361418545246124, + "grad_norm": 0.3274224102497101, "learning_rate": 0.0002704690104561297, - "loss": 2.4051, + "loss": 2.4047, "step": 71000 }, { "epoch": 3.3649577043510774, - "grad_norm": 0.31644776463508606, - "learning_rate": 0.00026971131989695404, - "loss": 2.4087, + "grad_norm": 0.31764140725135803, + "learning_rate": 0.00026971207758751326, + "loss": 2.4082, "step": 72000 }, { "epoch": 3.41169322802262, - "grad_norm": 0.35094141960144043, + "grad_norm": 0.36559322476387024, "learning_rate": 0.0002689543870283376, - "loss": 2.3977, + "loss": 2.3968, "step": 73000 }, { "epoch": 3.4584287516941625, - "grad_norm": 0.36921316385269165, + "grad_norm": 0.366269052028656, "learning_rate": 0.00026819669646916197, - "loss": 2.4109, + "loss": 2.4099, "step": 74000 }, { "epoch": 3.5051642753657055, - "grad_norm": 0.32217535376548767, - "learning_rate": 0.00026743976360054555, - "loss": 2.4036, + "grad_norm": 0.32572275400161743, + "learning_rate": 0.0002674390059099863, + "loss": 2.4031, "step": 75000 }, { "epoch": 3.551899799037248, - "grad_norm": 0.34006989002227783, - "learning_rate": 0.00026668283073192907, - "loss": 2.4057, + "grad_norm": 0.3399852514266968, + "learning_rate": 0.0002666820730413699, + "loss": 2.4052, "step": 76000 }, { "epoch": 3.598635322708791, - "grad_norm": 0.3229130506515503, - "learning_rate": 0.0002659251401727534, - "loss": 2.4056, + "grad_norm": 0.32422304153442383, + "learning_rate": 0.00026592438248219426, + "loss": 2.405, "step": 77000 }, { "epoch": 3.6453708463803336, - "grad_norm": 0.3431253433227539, - "learning_rate": 0.0002651674496135778, + "grad_norm": 0.3334662616252899, + "learning_rate": 0.0002651666919230186, "loss": 2.397, "step": 78000 }, { "epoch": 3.6921063700518766, - "grad_norm": 0.3360424041748047, + "grad_norm": 0.3444022834300995, "learning_rate": 0.00026440975905440213, - "loss": 2.3908, + "loss": 2.3898, "step": 79000 }, { "epoch": 3.738841893723419, - "grad_norm": 0.30646803975105286, + "grad_norm": 0.29835402965545654, "learning_rate": 0.00026365206849522654, - "loss": 2.3939, + "loss": 2.3935, "step": 80000 }, { "epoch": 3.785577417394962, - "grad_norm": 0.35814180970191956, + "grad_norm": 0.34723761677742004, "learning_rate": 0.0002628943779360509, - "loss": 2.3977, + "loss": 2.3974, "step": 81000 }, { "epoch": 3.8323129410665047, - "grad_norm": 0.3155287206172943, - "learning_rate": 0.0002621382027579936, - "loss": 2.3997, + "grad_norm": 0.3228047788143158, + "learning_rate": 0.0002621374450674344, + "loss": 2.399, "step": 82000 }, { "epoch": 3.879048464738047, - "grad_norm": 0.33178508281707764, - "learning_rate": 0.000261380512198818, - "loss": 2.3908, + "grad_norm": 0.32759812474250793, + "learning_rate": 0.0002613797545082588, + "loss": 2.3903, "step": 83000 }, { "epoch": 3.92578398840959, - "grad_norm": 0.32515403628349304, - "learning_rate": 0.00026062282163964235, - "loss": 2.3905, + "grad_norm": 0.3363027572631836, + "learning_rate": 0.0002606220639490832, + "loss": 2.3898, "step": 84000 }, { "epoch": 3.9725195120811327, - "grad_norm": 0.3208604156970978, - "learning_rate": 0.00025986588877102587, - "loss": 2.3967, + "grad_norm": 0.3208337128162384, + "learning_rate": 0.0002598651310804667, + "loss": 2.3962, "step": 85000 }, { "epoch": 4.0, - "eval_accuracy": 0.5063414538940354, - "eval_loss": 2.5534298419952393, - "eval_runtime": 183.4478, - "eval_samples_per_second": 383.352, - "eval_steps_per_second": 5.991, + "eval_accuracy": 0.5062499459791027, + "eval_loss": 2.5531787872314453, + "eval_runtime": 139.772, + "eval_samples_per_second": 503.141, + "eval_steps_per_second": 7.863, "step": 85588 }, { "epoch": 4.019255035752676, - "grad_norm": 0.3112667202949524, - "learning_rate": 0.0002591081982118503, - "loss": 2.3708, + "grad_norm": 0.29504185914993286, + "learning_rate": 0.00025910744052129106, + "loss": 2.3702, "step": 86000 }, { "epoch": 4.065990559424218, - "grad_norm": 0.33806201815605164, + "grad_norm": 0.3328858017921448, "learning_rate": 0.00025835050765267464, - "loss": 2.3319, + "loss": 2.3313, "step": 87000 }, { "epoch": 4.112726083095761, - "grad_norm": 0.33769309520721436, - "learning_rate": 0.00025759357478405816, - "loss": 2.3438, + "grad_norm": 0.33443954586982727, + "learning_rate": 0.000257592817093499, + "loss": 2.3432, "step": 88000 }, { "epoch": 4.159461606767304, - "grad_norm": 0.3399094343185425, - "learning_rate": 0.00025683664191544173, - "loss": 2.3478, + "grad_norm": 0.34589192271232605, + "learning_rate": 0.00025683512653432335, + "loss": 2.3473, "step": 89000 }, { "epoch": 4.206197130438847, - "grad_norm": 0.3605833053588867, - "learning_rate": 0.0002560789513562661, - "loss": 2.3403, + "grad_norm": 0.363839715719223, + "learning_rate": 0.0002560781936657069, + "loss": 2.3396, "step": 90000 }, { "epoch": 4.252932654110389, - "grad_norm": 0.35199499130249023, + "grad_norm": 0.3480285406112671, "learning_rate": 0.00025532126079709044, - "loss": 2.3451, + "loss": 2.3444, "step": 91000 }, { "epoch": 4.299668177781932, - "grad_norm": 0.3519517183303833, + "grad_norm": 0.3445780277252197, "learning_rate": 0.00025456357023791485, "loss": 2.3433, "step": 92000 }, { "epoch": 4.346403701453474, - "grad_norm": 0.3213213086128235, + "grad_norm": 0.3211309313774109, "learning_rate": 0.0002538058796787392, - "loss": 2.3399, + "loss": 2.3394, "step": 93000 }, { "epoch": 4.393139225125018, - "grad_norm": 0.3580070436000824, - "learning_rate": 0.00025304894681012273, - "loss": 2.3463, + "grad_norm": 0.34270036220550537, + "learning_rate": 0.00025304818911956356, + "loss": 2.346, "step": 94000 }, { "epoch": 4.43987474879656, - "grad_norm": 0.338870108127594, + "grad_norm": 0.339600145816803, "learning_rate": 0.0002522912562509471, - "loss": 2.3476, + "loss": 2.347, "step": 95000 }, { "epoch": 4.486610272468103, - "grad_norm": 0.33986231684684753, - "learning_rate": 0.0002515343233823306, - "loss": 2.3448, + "grad_norm": 0.3255089819431305, + "learning_rate": 0.00025153356569177144, + "loss": 2.3444, "step": 96000 }, { "epoch": 4.5333457961396455, - "grad_norm": 0.33853477239608765, - "learning_rate": 0.000250776632823155, - "loss": 2.3463, + "grad_norm": 0.3371700942516327, + "learning_rate": 0.00025077587513259585, + "loss": 2.3459, "step": 97000 }, { "epoch": 4.580081319811189, - "grad_norm": 0.3056027293205261, - "learning_rate": 0.00025001894226397937, - "loss": 2.3439, + "grad_norm": 0.30787476897239685, + "learning_rate": 0.0002500181845734202, + "loss": 2.3433, "step": 98000 }, { "epoch": 4.6268168434827315, - "grad_norm": 0.330951064825058, - "learning_rate": 0.0002492620093953629, - "loss": 2.3468, + "grad_norm": 0.3386484384536743, + "learning_rate": 0.00024926049401424456, + "loss": 2.3461, "step": 99000 }, { "epoch": 4.673552367154274, - "grad_norm": 0.32781165838241577, - "learning_rate": 0.00024850431883618725, - "loss": 2.3426, + "grad_norm": 0.317842572927475, + "learning_rate": 0.00024850280345506897, + "loss": 2.3417, "step": 100000 }, { "epoch": 4.720287890825817, - "grad_norm": 0.326337993144989, - "learning_rate": 0.0002477473859675708, - "loss": 2.3385, + "grad_norm": 0.3175218403339386, + "learning_rate": 0.0002477458705864525, + "loss": 2.3381, "step": 101000 }, { "epoch": 4.767023414497359, - "grad_norm": 0.30979540944099426, - "learning_rate": 0.0002469896954083952, - "loss": 2.3517, + "grad_norm": 0.3106536865234375, + "learning_rate": 0.000246988937717836, + "loss": 2.3513, "step": 102000 }, { "epoch": 4.8137589381689025, - "grad_norm": 0.3351231813430786, + "grad_norm": 0.3228330612182617, "learning_rate": 0.00024623200484921953, - "loss": 2.3306, + "loss": 2.3298, "step": 103000 }, { "epoch": 4.860494461840445, - "grad_norm": 0.3361092209815979, - "learning_rate": 0.0002454750719806031, - "loss": 2.3406, + "grad_norm": 0.3342211842536926, + "learning_rate": 0.00024547431429004394, + "loss": 2.3399, "step": 104000 }, { "epoch": 4.907229985511988, - "grad_norm": 0.36400631070137024, - "learning_rate": 0.00024471738142142746, - "loss": 2.345, + "grad_norm": 0.357527494430542, + "learning_rate": 0.0002447166237308683, + "loss": 2.3446, "step": 105000 }, { "epoch": 4.95396550918353, - "grad_norm": 0.3127535581588745, - "learning_rate": 0.000243960448552811, - "loss": 2.3376, + "grad_norm": 0.3023281991481781, + "learning_rate": 0.00024395893317169265, + "loss": 2.3371, "step": 106000 }, { "epoch": 5.0, - "eval_accuracy": 0.5100188341244781, - "eval_loss": 2.524883985519409, - "eval_runtime": 183.6153, - "eval_samples_per_second": 383.002, - "eval_steps_per_second": 5.985, + "eval_accuracy": 0.5100384071154235, + "eval_loss": 2.5247113704681396, + "eval_runtime": 140.0479, + "eval_samples_per_second": 502.15, + "eval_steps_per_second": 7.847, "step": 106985 }, { "epoch": 5.000701032855073, - "grad_norm": 0.3345338702201843, - "learning_rate": 0.00024320275799363537, - "loss": 2.3364, + "grad_norm": 0.33317577838897705, + "learning_rate": 0.00024320200030307623, + "loss": 2.3362, "step": 107000 }, { "epoch": 5.047436556526616, - "grad_norm": 0.32806724309921265, - "learning_rate": 0.00024244506743445975, - "loss": 2.2779, + "grad_norm": 0.32777294516563416, + "learning_rate": 0.00024244430974390058, + "loss": 2.2772, "step": 108000 }, { "epoch": 5.094172080198159, - "grad_norm": 0.3400112986564636, - "learning_rate": 0.0002416881345658433, + "grad_norm": 0.3370286822319031, + "learning_rate": 0.0002416873768752841, "loss": 2.287, "step": 109000 }, { "epoch": 5.140907603869701, - "grad_norm": 0.2923315763473511, - "learning_rate": 0.00024093120169722682, - "loss": 2.2887, + "grad_norm": 0.29575446248054504, + "learning_rate": 0.0002409296863161085, + "loss": 2.2884, "step": 110000 }, { "epoch": 5.187643127541244, - "grad_norm": 0.36175695061683655, - "learning_rate": 0.0002401735111380512, - "loss": 2.2956, + "grad_norm": 0.375265508890152, + "learning_rate": 0.00024017199575693284, + "loss": 2.2953, "step": 111000 }, { "epoch": 5.234378651212787, - "grad_norm": 0.32664230465888977, - "learning_rate": 0.00023941582057887558, - "loss": 2.2987, + "grad_norm": 0.33020052313804626, + "learning_rate": 0.0002394150628883164, + "loss": 2.2981, "step": 112000 }, { "epoch": 5.28111417488433, - "grad_norm": 0.3379104733467102, - "learning_rate": 0.00023865813001969994, - "loss": 2.2949, + "grad_norm": 0.33341318368911743, + "learning_rate": 0.00023865737232914077, + "loss": 2.2945, "step": 113000 }, { "epoch": 5.327849698555872, - "grad_norm": 0.3442465364933014, - "learning_rate": 0.0002379011971510835, - "loss": 2.2926, + "grad_norm": 0.34047931432724, + "learning_rate": 0.00023789968176996513, + "loss": 2.2922, "step": 114000 }, { "epoch": 5.374585222227415, - "grad_norm": 0.3272307217121124, - "learning_rate": 0.00023714350659190784, - "loss": 2.291, + "grad_norm": 0.3312656879425049, + "learning_rate": 0.00023714274890134865, + "loss": 2.2911, "step": 115000 }, { "epoch": 5.421320745898957, - "grad_norm": 0.33245283365249634, - "learning_rate": 0.0002363858160327322, - "loss": 2.304, + "grad_norm": 0.34014657139778137, + "learning_rate": 0.00023638505834217306, + "loss": 2.3037, "step": 116000 }, { "epoch": 5.468056269570501, - "grad_norm": 0.3197270333766937, - "learning_rate": 0.00023562812547355658, - "loss": 2.3024, + "grad_norm": 0.3288027346134186, + "learning_rate": 0.00023562736778299742, + "loss": 2.3023, "step": 117000 }, { "epoch": 5.514791793242043, - "grad_norm": 0.32617905735969543, - "learning_rate": 0.00023487119260494013, - "loss": 2.3062, + "grad_norm": 0.3191498816013336, + "learning_rate": 0.00023487043491438094, + "loss": 2.3057, "step": 118000 }, { "epoch": 5.561527316913586, - "grad_norm": 0.3404013514518738, - "learning_rate": 0.00023411425973632365, - "loss": 2.3055, + "grad_norm": 0.3255312740802765, + "learning_rate": 0.00023411274435520532, + "loss": 2.3051, "step": 119000 }, { "epoch": 5.6082628405851285, - "grad_norm": 0.35346364974975586, - "learning_rate": 0.00023335656917714803, - "loss": 2.3029, + "grad_norm": 0.3476366400718689, + "learning_rate": 0.00023335581148658884, + "loss": 2.3021, "step": 120000 }, { "epoch": 5.654998364256672, - "grad_norm": 0.33657070994377136, - "learning_rate": 0.00023259963630853155, - "loss": 2.3001, + "grad_norm": 0.35009336471557617, + "learning_rate": 0.00023259812092741322, + "loss": 2.2996, "step": 121000 }, { "epoch": 5.7017338879282145, - "grad_norm": 0.3316746950149536, - "learning_rate": 0.00023184194574935594, - "loss": 2.2941, + "grad_norm": 0.3336103558540344, + "learning_rate": 0.0002318404303682376, + "loss": 2.2936, "step": 122000 }, { "epoch": 5.748469411599757, - "grad_norm": 0.2895432114601135, - "learning_rate": 0.00023108425519018032, - "loss": 2.3032, + "grad_norm": 0.29048165678977966, + "learning_rate": 0.00023108349749962113, + "loss": 2.3027, "step": 123000 }, { "epoch": 5.7952049352712995, - "grad_norm": 0.3094755709171295, - "learning_rate": 0.00023032656463100467, - "loss": 2.3081, + "grad_norm": 0.3026094138622284, + "learning_rate": 0.00023032580694044548, + "loss": 2.308, "step": 124000 }, { "epoch": 5.841940458942842, - "grad_norm": 0.3222903311252594, - "learning_rate": 0.00022956887407182906, - "loss": 2.3091, + "grad_norm": 0.3187408447265625, + "learning_rate": 0.0002295681163812699, + "loss": 2.3089, "step": 125000 }, { "epoch": 5.8886759826143855, - "grad_norm": 0.34477153420448303, - "learning_rate": 0.0002288111835126534, - "loss": 2.2979, + "grad_norm": 0.3474861681461334, + "learning_rate": 0.00022881042582209425, + "loss": 2.2971, "step": 126000 }, { "epoch": 5.935411506285928, - "grad_norm": 0.31651487946510315, - "learning_rate": 0.00022805500833459613, - "loss": 2.304, + "grad_norm": 0.3091016113758087, + "learning_rate": 0.00022805349295347777, + "loss": 2.3034, "step": 127000 }, { "epoch": 5.982147029957471, - "grad_norm": 0.32441022992134094, - "learning_rate": 0.00022729731777542048, - "loss": 2.3036, + "grad_norm": 0.32493680715560913, + "learning_rate": 0.00022729656008486132, + "loss": 2.3029, "step": 128000 }, { "epoch": 6.0, - "eval_accuracy": 0.5122081094080007, - "eval_loss": 2.509770154953003, - "eval_runtime": 183.5984, - "eval_samples_per_second": 383.037, - "eval_steps_per_second": 5.986, + "eval_accuracy": 0.5121493346716575, + "eval_loss": 2.5101099014282227, + "eval_runtime": 139.7071, + "eval_samples_per_second": 503.375, + "eval_steps_per_second": 7.866, "step": 128382 }, { "epoch": 6.028882553629013, - "grad_norm": 0.339528352022171, + "grad_norm": 0.3480684459209442, "learning_rate": 0.0002265396272162449, - "loss": 2.2707, + "loss": 2.2705, "step": 129000 }, { "epoch": 6.075618077300557, - "grad_norm": 0.33392465114593506, + "grad_norm": 0.3242765963077545, "learning_rate": 0.00022578193665706925, - "loss": 2.2395, + "loss": 2.2394, "step": 130000 }, { "epoch": 6.122353600972099, - "grad_norm": 0.3576413094997406, - "learning_rate": 0.00022502500378845277, - "loss": 2.2522, + "grad_norm": 0.38513755798339844, + "learning_rate": 0.0002250242460978936, + "loss": 2.2516, "step": 131000 }, { "epoch": 6.169089124643642, - "grad_norm": 0.370988667011261, - "learning_rate": 0.00022426807091983632, - "loss": 2.2593, + "grad_norm": 0.34748023748397827, + "learning_rate": 0.00022426655553871796, + "loss": 2.2587, "step": 132000 }, { "epoch": 6.215824648315184, - "grad_norm": 0.3223007619380951, - "learning_rate": 0.00022351038036066067, - "loss": 2.253, + "grad_norm": 0.40733420848846436, + "learning_rate": 0.0002235088649795423, + "loss": 2.2525, "step": 133000 }, { "epoch": 6.262560171986727, - "grad_norm": 0.3452748954296112, - "learning_rate": 0.00022275268980148503, - "loss": 2.2628, + "grad_norm": 0.34682250022888184, + "learning_rate": 0.0002227519321109259, + "loss": 2.2625, "step": 134000 }, { "epoch": 6.30929569565827, - "grad_norm": 0.33391740918159485, - "learning_rate": 0.0002219957569328686, - "loss": 2.26, + "grad_norm": 0.31701159477233887, + "learning_rate": 0.00022199424155175024, + "loss": 2.2596, "step": 135000 }, { "epoch": 6.356031219329813, - "grad_norm": 0.32963138818740845, - "learning_rate": 0.00022123806637369296, - "loss": 2.2612, + "grad_norm": 0.3319772481918335, + "learning_rate": 0.0002212373086831338, + "loss": 2.2606, "step": 136000 }, { "epoch": 6.402766743001355, - "grad_norm": 0.33252349495887756, - "learning_rate": 0.0002204803758145173, - "loss": 2.2659, + "grad_norm": 0.3415438234806061, + "learning_rate": 0.00022047961812395815, + "loss": 2.2661, "step": 137000 }, { "epoch": 6.449502266672898, - "grad_norm": 0.3672574460506439, - "learning_rate": 0.00021972344294590086, - "loss": 2.2692, + "grad_norm": 0.37148797512054443, + "learning_rate": 0.00021972192756478253, + "loss": 2.2691, "step": 138000 }, { "epoch": 6.496237790344441, - "grad_norm": 0.36064982414245605, - "learning_rate": 0.00021896575238672524, - "loss": 2.261, + "grad_norm": 0.3590894341468811, + "learning_rate": 0.00021896423700560688, + "loss": 2.2605, "step": 139000 }, { "epoch": 6.542973314015984, - "grad_norm": 0.3338433802127838, - "learning_rate": 0.0002182080618275496, - "loss": 2.2741, + "grad_norm": 0.35641875863075256, + "learning_rate": 0.00021820654644643127, + "loss": 2.2734, "step": 140000 }, { "epoch": 6.589708837687526, - "grad_norm": 0.36847883462905884, - "learning_rate": 0.00021745112895893315, - "loss": 2.2643, + "grad_norm": 0.3251437842845917, + "learning_rate": 0.0002174496135778148, + "loss": 2.2634, "step": 141000 }, { "epoch": 6.636444361359069, - "grad_norm": 0.33524858951568604, - "learning_rate": 0.0002166941960903167, - "loss": 2.2804, + "grad_norm": 0.33028358221054077, + "learning_rate": 0.00021669192301863917, + "loss": 2.2798, "step": 142000 }, { "epoch": 6.6831798850306114, - "grad_norm": 0.3156317174434662, - "learning_rate": 0.00021593650553114108, - "loss": 2.2583, + "grad_norm": 0.32118934392929077, + "learning_rate": 0.00021593499015002272, + "loss": 2.2579, "step": 143000 }, { "epoch": 6.729915408702155, - "grad_norm": 0.3375246226787567, - "learning_rate": 0.00021517881497196543, - "loss": 2.2719, + "grad_norm": 0.3405369222164154, + "learning_rate": 0.00021517729959084707, + "loss": 2.2711, "step": 144000 }, { "epoch": 6.776650932373697, - "grad_norm": 0.29037168622016907, - "learning_rate": 0.0002144211244127898, - "loss": 2.2612, + "grad_norm": 0.2833816111087799, + "learning_rate": 0.00021442036672223062, + "loss": 2.2609, "step": 145000 }, { "epoch": 6.82338645604524, - "grad_norm": 0.37422460317611694, - "learning_rate": 0.00021366419154417334, - "loss": 2.2678, + "grad_norm": 0.3548373281955719, + "learning_rate": 0.00021366267616305498, + "loss": 2.2673, "step": 146000 }, { "epoch": 6.8701219797167825, - "grad_norm": 0.3063635230064392, - "learning_rate": 0.00021290650098499772, - "loss": 2.2716, + "grad_norm": 0.32530516386032104, + "learning_rate": 0.00021290498560387936, + "loss": 2.2713, "step": 147000 }, { "epoch": 6.916857503388325, - "grad_norm": 0.3298608660697937, - "learning_rate": 0.00021214881042582208, - "loss": 2.274, + "grad_norm": 0.3257991373538971, + "learning_rate": 0.00021214729504470372, + "loss": 2.2733, "step": 148000 }, { "epoch": 6.9635930270598685, - "grad_norm": 0.342487096786499, - "learning_rate": 0.00021139111986664643, - "loss": 2.2667, + "grad_norm": 0.3283573091030121, + "learning_rate": 0.0002113896044855281, + "loss": 2.2663, "step": 149000 }, { "epoch": 7.0, - "eval_accuracy": 0.5142437004663223, - "eval_loss": 2.497164011001587, - "eval_runtime": 183.5925, - "eval_samples_per_second": 383.049, - "eval_steps_per_second": 5.986, + "eval_accuracy": 0.5142700208416108, + "eval_loss": 2.4969913959503174, + "eval_runtime": 139.7665, + "eval_samples_per_second": 503.161, + "eval_steps_per_second": 7.863, "step": 149779 }, { "epoch": 7.010328550731411, - "grad_norm": 0.3236839175224304, - "learning_rate": 0.00021063418699802998, - "loss": 2.2595, + "grad_norm": 0.3327067196369171, + "learning_rate": 0.00021063267161691162, + "loss": 2.259, "step": 150000 }, { "epoch": 7.057064074402954, - "grad_norm": 0.3207620680332184, - "learning_rate": 0.00020987649643885433, - "loss": 2.2096, + "grad_norm": 0.30795711278915405, + "learning_rate": 0.000209874981057736, + "loss": 2.2092, "step": 151000 }, { "epoch": 7.103799598074496, - "grad_norm": 0.3556087911128998, - "learning_rate": 0.0002091195635702379, - "loss": 2.2121, + "grad_norm": 0.3426005244255066, + "learning_rate": 0.00020911804818911955, + "loss": 2.2117, "step": 152000 }, { "epoch": 7.15053512174604, - "grad_norm": 0.3455691933631897, - "learning_rate": 0.00020836187301106226, - "loss": 2.225, + "grad_norm": 0.3453496992588043, + "learning_rate": 0.0002083603576299439, + "loss": 2.2247, "step": 153000 }, { "epoch": 7.197270645417582, - "grad_norm": 0.3611804246902466, - "learning_rate": 0.00020760418245188662, - "loss": 2.2173, + "grad_norm": 0.345022588968277, + "learning_rate": 0.00020760266707076826, + "loss": 2.2167, "step": 154000 }, { "epoch": 7.244006169089125, - "grad_norm": 0.3559030592441559, - "learning_rate": 0.00020684724958327017, - "loss": 2.224, + "grad_norm": 0.3381699025630951, + "learning_rate": 0.00020684573420215184, + "loss": 2.2232, "step": 155000 }, { "epoch": 7.290741692760667, - "grad_norm": 0.3480346202850342, - "learning_rate": 0.0002060903167146537, - "loss": 2.235, + "grad_norm": 0.3471800982952118, + "learning_rate": 0.0002060880436429762, + "loss": 2.2345, "step": 156000 }, { "epoch": 7.33747721643221, - "grad_norm": 0.3583781123161316, - "learning_rate": 0.0002053326261554781, - "loss": 2.2285, + "grad_norm": 0.3486413061618805, + "learning_rate": 0.00020533035308380055, + "loss": 2.228, "step": 157000 }, { "epoch": 7.384212740103753, - "grad_norm": 0.35747647285461426, - "learning_rate": 0.00020457493559630245, - "loss": 2.2334, + "grad_norm": 0.34628814458847046, + "learning_rate": 0.0002045734202151841, + "loss": 2.2329, "step": 158000 }, { "epoch": 7.430948263775296, - "grad_norm": 0.33115991950035095, - "learning_rate": 0.0002038172450371268, - "loss": 2.2417, + "grad_norm": 0.3533773720264435, + "learning_rate": 0.00020381572965600845, + "loss": 2.241, "step": 159000 }, { "epoch": 7.477683787446838, - "grad_norm": 0.362159788608551, - "learning_rate": 0.0002030595544779512, - "loss": 2.2351, + "grad_norm": 0.3611375689506531, + "learning_rate": 0.00020305879678739203, + "loss": 2.2349, "step": 160000 }, { "epoch": 7.524419311118381, - "grad_norm": 0.3421652913093567, - "learning_rate": 0.00020230262160933474, - "loss": 2.2406, + "grad_norm": 0.3317527174949646, + "learning_rate": 0.00020230110622821638, + "loss": 2.2402, "step": 161000 }, { "epoch": 7.571154834789924, - "grad_norm": 0.3309684991836548, - "learning_rate": 0.0002015449310501591, - "loss": 2.2393, + "grad_norm": 0.3507509231567383, + "learning_rate": 0.00020154341566904074, + "loss": 2.2392, "step": 162000 }, { "epoch": 7.617890358461467, - "grad_norm": 0.36814552545547485, - "learning_rate": 0.00020078724049098345, - "loss": 2.2281, + "grad_norm": 0.34763872623443604, + "learning_rate": 0.00020078648280042429, + "loss": 2.2276, "step": 163000 }, { "epoch": 7.664625882133009, - "grad_norm": 0.337954044342041, - "learning_rate": 0.00020003030762236703, - "loss": 2.2351, + "grad_norm": 0.33221837878227234, + "learning_rate": 0.0002000295499318078, + "loss": 2.2347, "step": 164000 }, { "epoch": 7.711361405804552, - "grad_norm": 0.32359594106674194, - "learning_rate": 0.00019927261706319138, - "loss": 2.2397, + "grad_norm": 0.3386111855506897, + "learning_rate": 0.00019927185937263222, + "loss": 2.2394, "step": 165000 }, { "epoch": 7.758096929476094, - "grad_norm": 0.36153310537338257, - "learning_rate": 0.00019851568419457493, - "loss": 2.2487, + "grad_norm": 0.3480163812637329, + "learning_rate": 0.00019851416881345657, + "loss": 2.248, "step": 166000 }, { "epoch": 7.804832453147638, - "grad_norm": 0.32292258739471436, - "learning_rate": 0.00019775799363539929, - "loss": 2.2384, + "grad_norm": 0.3366560935974121, + "learning_rate": 0.00019775647825428093, + "loss": 2.2382, "step": 167000 }, { "epoch": 7.85156797681918, - "grad_norm": 0.3319338262081146, - "learning_rate": 0.00019700030307622364, + "grad_norm": 0.34862497448921204, + "learning_rate": 0.0001969995453856645, "loss": 2.2451, "step": 168000 }, { "epoch": 7.898303500490723, - "grad_norm": 0.33854612708091736, - "learning_rate": 0.00019624261251704802, - "loss": 2.2433, + "grad_norm": 0.3470497131347656, + "learning_rate": 0.00019624185482648886, + "loss": 2.2426, "step": 169000 }, { "epoch": 7.9450390241622655, - "grad_norm": 0.3438510298728943, - "learning_rate": 0.00019548567964843157, - "loss": 2.2488, + "grad_norm": 0.3333776593208313, + "learning_rate": 0.00019548492195787238, + "loss": 2.2483, "step": 170000 }, { "epoch": 7.991774547833808, - "grad_norm": 0.3351077437400818, - "learning_rate": 0.0001947287467798151, - "loss": 2.2441, + "grad_norm": 0.32424941658973694, + "learning_rate": 0.00019472723139869676, + "loss": 2.2435, "step": 171000 }, { "epoch": 8.0, - "eval_accuracy": 0.5155260938360413, - "eval_loss": 2.488969326019287, - "eval_runtime": 184.3466, - "eval_samples_per_second": 381.482, - "eval_steps_per_second": 5.962, + "eval_accuracy": 0.5154590103371601, + "eval_loss": 2.489173650741577, + "eval_runtime": 139.989, + "eval_samples_per_second": 502.361, + "eval_steps_per_second": 7.851, "step": 171176 }, { "epoch": 8.038510071505351, - "grad_norm": 0.34663721919059753, - "learning_rate": 0.00019397105622063948, - "loss": 2.1914, + "grad_norm": 0.3538364768028259, + "learning_rate": 0.00019397029853008028, + "loss": 2.1913, "step": 172000 }, { "epoch": 8.085245595176893, - "grad_norm": 0.3605271875858307, - "learning_rate": 0.00019321336566146386, - "loss": 2.1837, + "grad_norm": 0.3545621633529663, + "learning_rate": 0.00019321260797090466, + "loss": 2.1835, "step": 173000 }, { "epoch": 8.131981118848437, - "grad_norm": 0.3562985956668854, - "learning_rate": 0.00019245643279284738, - "loss": 2.1945, + "grad_norm": 0.36968985199928284, + "learning_rate": 0.00019245491741172905, + "loss": 2.1939, "step": 174000 }, { "epoch": 8.17871664251998, - "grad_norm": 0.39077556133270264, - "learning_rate": 0.00019169874223367176, - "loss": 2.1957, + "grad_norm": 0.37359848618507385, + "learning_rate": 0.00019169798454311257, + "loss": 2.1955, "step": 175000 }, { "epoch": 8.225452166191522, - "grad_norm": 0.3390072286128998, - "learning_rate": 0.00019094180936505528, - "loss": 2.1947, + "grad_norm": 0.3492608368396759, + "learning_rate": 0.00019094029398393692, + "loss": 2.1942, "step": 176000 }, { "epoch": 8.272187689863065, - "grad_norm": 0.3693525195121765, + "grad_norm": 0.3621041774749756, "learning_rate": 0.00019018411880587964, - "loss": 2.2082, + "loss": 2.2079, "step": 177000 }, { "epoch": 8.318923213534609, - "grad_norm": 0.3887868821620941, - "learning_rate": 0.0001894271859372632, - "loss": 2.2082, + "grad_norm": 0.3806013762950897, + "learning_rate": 0.00018942642824670405, + "loss": 2.2077, "step": 178000 }, { "epoch": 8.36565873720615, - "grad_norm": 0.3565722107887268, - "learning_rate": 0.00018866949537808757, - "loss": 2.2017, + "grad_norm": 0.3603513836860657, + "learning_rate": 0.0001886687376875284, + "loss": 2.2008, "step": 179000 }, { "epoch": 8.412394260877694, - "grad_norm": 0.3344291150569916, - "learning_rate": 0.00018791256250947112, - "loss": 2.2127, + "grad_norm": 0.3391522765159607, + "learning_rate": 0.00018791104712835276, + "loss": 2.2125, "step": 180000 }, { "epoch": 8.459129784549235, - "grad_norm": 0.3664642572402954, - "learning_rate": 0.00018715487195029547, - "loss": 2.2137, + "grad_norm": 0.3769329786300659, + "learning_rate": 0.00018715411425973633, + "loss": 2.2134, "step": 181000 }, { "epoch": 8.505865308220779, - "grad_norm": 0.36379629373550415, + "grad_norm": 0.3605878949165344, "learning_rate": 0.00018639718139111985, - "loss": 2.2126, + "loss": 2.2121, "step": 182000 }, { "epoch": 8.552600831892322, - "grad_norm": 0.34674862027168274, - "learning_rate": 0.0001856402485225034, - "loss": 2.2106, + "grad_norm": 0.3550224304199219, + "learning_rate": 0.0001856394908319442, + "loss": 2.2102, "step": 183000 }, { "epoch": 8.599336355563864, - "grad_norm": 0.3491998314857483, - "learning_rate": 0.00018488255796332776, - "loss": 2.2122, + "grad_norm": 0.3377740681171417, + "learning_rate": 0.0001848818002727686, + "loss": 2.2121, "step": 184000 }, { "epoch": 8.646071879235407, - "grad_norm": 0.3424486219882965, - "learning_rate": 0.00018412486740415211, - "loss": 2.2144, + "grad_norm": 0.34097522497177124, + "learning_rate": 0.00018412410971359295, + "loss": 2.2137, "step": 185000 }, { "epoch": 8.692807402906949, - "grad_norm": 0.37751293182373047, + "grad_norm": 0.3758851885795593, "learning_rate": 0.00018336717684497647, - "loss": 2.2133, + "loss": 2.2128, "step": 186000 }, { "epoch": 8.739542926578492, - "grad_norm": 0.3451727628707886, - "learning_rate": 0.00018261024397636004, - "loss": 2.2104, + "grad_norm": 0.35435131192207336, + "learning_rate": 0.00018260948628580088, + "loss": 2.2096, "step": 187000 }, { "epoch": 8.786278450250036, - "grad_norm": 0.34991177916526794, - "learning_rate": 0.0001818533111077436, - "loss": 2.2203, + "grad_norm": 0.3414399325847626, + "learning_rate": 0.00018185179572662523, + "loss": 2.2194, "step": 188000 }, { "epoch": 8.833013973921577, - "grad_norm": 0.3395465314388275, - "learning_rate": 0.00018109562054856795, - "loss": 2.2196, + "grad_norm": 0.35522088408470154, + "learning_rate": 0.00018109486285800876, + "loss": 2.2189, "step": 189000 }, { "epoch": 8.87974949759312, - "grad_norm": 0.3524405360221863, - "learning_rate": 0.0001803379299893923, - "loss": 2.2167, + "grad_norm": 0.3371594250202179, + "learning_rate": 0.00018033717229883316, + "loss": 2.2161, "step": 190000 }, { "epoch": 8.926485021264662, - "grad_norm": 0.37763360142707825, - "learning_rate": 0.00017958023943021669, - "loss": 2.218, + "grad_norm": 0.3472670316696167, + "learning_rate": 0.00017957948173965752, + "loss": 2.2175, "step": 191000 }, { "epoch": 8.973220544936206, - "grad_norm": 0.33627304434776306, + "grad_norm": 0.32891738414764404, "learning_rate": 0.00017882254887104104, - "loss": 2.2176, + "loss": 2.2171, "step": 192000 }, { "epoch": 9.0, - "eval_accuracy": 0.5163104633250385, - "eval_loss": 2.4842023849487305, - "eval_runtime": 183.7529, - "eval_samples_per_second": 382.715, - "eval_steps_per_second": 5.981, + "eval_accuracy": 0.5163422485240097, + "eval_loss": 2.4830613136291504, + "eval_runtime": 139.9119, + "eval_samples_per_second": 502.638, + "eval_steps_per_second": 7.855, "step": 192573 }, { "epoch": 9.01995606860775, - "grad_norm": 0.33450332283973694, - "learning_rate": 0.0001780656160024246, - "loss": 2.1874, + "grad_norm": 0.3490764796733856, + "learning_rate": 0.00017806485831186542, + "loss": 2.1869, "step": 193000 }, { "epoch": 9.066691592279291, - "grad_norm": 0.3548380434513092, - "learning_rate": 0.00017730792544324895, - "loss": 2.1647, + "grad_norm": 0.3345907926559448, + "learning_rate": 0.0001773071677526898, + "loss": 2.1642, "step": 194000 }, { "epoch": 9.113427115950834, - "grad_norm": 0.37969735264778137, + "grad_norm": 0.37054187059402466, "learning_rate": 0.00017655023488407333, "loss": 2.1695, "step": 195000 }, { "epoch": 9.160162639622378, - "grad_norm": 0.3634478747844696, - "learning_rate": 0.00017579330201545688, - "loss": 2.1706, + "grad_norm": 0.36623892188072205, + "learning_rate": 0.0001757925443248977, + "loss": 2.1698, "step": 196000 }, { "epoch": 9.20689816329392, - "grad_norm": 0.3782573640346527, + "grad_norm": 0.36242425441741943, "learning_rate": 0.00017503561145628123, - "loss": 2.1699, + "loss": 2.1697, "step": 197000 }, { "epoch": 9.253633686965463, - "grad_norm": 0.3733539283275604, - "learning_rate": 0.00017427867858766478, - "loss": 2.1783, + "grad_norm": 0.36991021037101746, + "learning_rate": 0.00017427792089710559, + "loss": 2.1775, "step": 198000 }, { "epoch": 9.300369210637005, - "grad_norm": 0.367725133895874, - "learning_rate": 0.00017352098802848916, - "loss": 2.1825, + "grad_norm": 0.3689287006855011, + "learning_rate": 0.00017352023033793, + "loss": 2.1818, "step": 199000 }, { "epoch": 9.347104734308548, - "grad_norm": 0.3784054219722748, - "learning_rate": 0.0001727640551598727, - "loss": 2.1842, + "grad_norm": 0.37551766633987427, + "learning_rate": 0.00017276329746931352, + "loss": 2.1836, "step": 200000 }, { "epoch": 9.393840257980091, - "grad_norm": 0.37121427059173584, - "learning_rate": 0.00017200636460069707, - "loss": 2.1854, + "grad_norm": 0.3720178008079529, + "learning_rate": 0.00017200560691013787, + "loss": 2.1852, "step": 201000 }, { "epoch": 9.440575781651633, - "grad_norm": 0.3519172668457031, - "learning_rate": 0.00017124867404152142, - "loss": 2.1841, + "grad_norm": 0.3440088927745819, + "learning_rate": 0.00017124791635096225, + "loss": 2.1836, "step": 202000 }, { "epoch": 9.487311305323177, - "grad_norm": 0.37216153740882874, + "grad_norm": 0.37482950091362, "learning_rate": 0.00017049098348234578, - "loss": 2.185, + "loss": 2.1847, "step": 203000 }, { "epoch": 9.534046828994718, - "grad_norm": 0.3630688488483429, - "learning_rate": 0.00016973405061372935, - "loss": 2.1849, + "grad_norm": 0.36125728487968445, + "learning_rate": 0.00016973329292317016, + "loss": 2.1841, "step": 204000 }, { "epoch": 9.580782352666262, - "grad_norm": 0.33998388051986694, - "learning_rate": 0.0001689763600545537, - "loss": 2.1911, + "grad_norm": 0.3412284851074219, + "learning_rate": 0.00016897560236399454, + "loss": 2.1907, "step": 205000 }, { "epoch": 9.627517876337805, - "grad_norm": 0.3803759515285492, - "learning_rate": 0.00016821942718593726, - "loss": 2.1905, + "grad_norm": 0.36377474665641785, + "learning_rate": 0.00016821866949537806, + "loss": 2.1899, "step": 206000 }, { "epoch": 9.674253400009347, - "grad_norm": 0.3438846170902252, + "grad_norm": 0.3450999855995178, "learning_rate": 0.0001674617366267616, - "loss": 2.1997, + "loss": 2.1993, "step": 207000 }, { "epoch": 9.72098892368089, - "grad_norm": 0.3872727155685425, + "grad_norm": 0.40802010893821716, "learning_rate": 0.000166704046067586, - "loss": 2.1918, + "loss": 2.1916, "step": 208000 }, { "epoch": 9.767724447352432, - "grad_norm": 0.32597804069519043, + "grad_norm": 0.3310614228248596, "learning_rate": 0.00016594635550841035, - "loss": 2.1798, + "loss": 2.1793, "step": 209000 }, { "epoch": 9.814459971023975, - "grad_norm": 0.3644990026950836, - "learning_rate": 0.00016519018033035306, - "loss": 2.1933, + "grad_norm": 0.3661220669746399, + "learning_rate": 0.0001651886649492347, + "loss": 2.193, "step": 210000 }, { "epoch": 9.861195494695519, - "grad_norm": 0.3531164228916168, - "learning_rate": 0.00016443248977117742, - "loss": 2.1937, + "grad_norm": 0.35508328676223755, + "learning_rate": 0.0001644309743900591, + "loss": 2.1936, "step": 211000 }, { "epoch": 9.90793101836706, - "grad_norm": 0.3724701702594757, - "learning_rate": 0.00016367479921200183, - "loss": 2.2031, + "grad_norm": 0.3655872941017151, + "learning_rate": 0.00016367404152144263, + "loss": 2.2022, "step": 212000 }, { "epoch": 9.954666542038604, - "grad_norm": 0.3369377851486206, - "learning_rate": 0.00016291786634338535, - "loss": 2.1899, + "grad_norm": 0.3304108679294586, + "learning_rate": 0.000162916350962267, + "loss": 2.1902, "step": 213000 }, { "epoch": 10.0, - "eval_accuracy": 0.5169506283850192, - "eval_loss": 2.4832394123077393, - "eval_runtime": 183.5783, - "eval_samples_per_second": 383.079, - "eval_steps_per_second": 5.987, + "eval_accuracy": 0.5171048702452897, + "eval_loss": 2.4811017513275146, + "eval_runtime": 139.6797, + "eval_samples_per_second": 503.473, + "eval_steps_per_second": 7.868, "step": 213970 }, { "epoch": 10.001402065710145, - "grad_norm": 0.3664819300174713, - "learning_rate": 0.0001621609334747689, - "loss": 2.1935, + "grad_norm": 0.3663347065448761, + "learning_rate": 0.00016215941809365054, + "loss": 2.1928, "step": 214000 }, { "epoch": 10.048137589381689, - "grad_norm": 0.3685780465602875, - "learning_rate": 0.00016140324291559325, - "loss": 2.1375, + "grad_norm": 0.37336432933807373, + "learning_rate": 0.0001614017275344749, + "loss": 2.1367, "step": 215000 }, { "epoch": 10.094873113053232, - "grad_norm": 0.36441028118133545, - "learning_rate": 0.0001606455523564176, - "loss": 2.1397, + "grad_norm": 0.3609987795352936, + "learning_rate": 0.00016064403697529925, + "loss": 2.1386, "step": 216000 }, { "epoch": 10.141608636724774, - "grad_norm": 0.3830730617046356, - "learning_rate": 0.000159887861797242, - "loss": 2.1525, + "grad_norm": 0.3735845685005188, + "learning_rate": 0.00015988710410668282, + "loss": 2.1516, "step": 217000 }, { "epoch": 10.188344160396317, - "grad_norm": 0.36895737051963806, - "learning_rate": 0.00015913017123806637, - "loss": 2.1468, + "grad_norm": 0.36659902334213257, + "learning_rate": 0.00015912941354750718, + "loss": 2.1463, "step": 218000 }, { "epoch": 10.23507968406786, - "grad_norm": 0.35749468207359314, + "grad_norm": 0.39549151062965393, "learning_rate": 0.00015837248067889073, - "loss": 2.1526, + "loss": 2.1518, "step": 219000 }, { "epoch": 10.281815207739402, - "grad_norm": 0.38449355959892273, - "learning_rate": 0.00015761554781027425, - "loss": 2.151, + "grad_norm": 0.38731181621551514, + "learning_rate": 0.00015761479011971508, + "loss": 2.1506, "step": 220000 }, { "epoch": 10.328550731410946, - "grad_norm": 0.39128056168556213, - "learning_rate": 0.00015685785725109866, + "grad_norm": 0.4099198579788208, + "learning_rate": 0.00015685709956053947, "loss": 2.1625, "step": 221000 }, { "epoch": 10.375286255082488, - "grad_norm": 0.3806777596473694, - "learning_rate": 0.00015610168207304137, - "loss": 2.1613, + "grad_norm": 0.393976092338562, + "learning_rate": 0.00015609940900136382, + "loss": 2.1607, "step": 222000 }, { "epoch": 10.422021778754031, - "grad_norm": 0.3579350411891937, - "learning_rate": 0.00015534399151386573, - "loss": 2.1629, + "grad_norm": 0.3783334791660309, + "learning_rate": 0.00015534323382330654, + "loss": 2.1624, "step": 223000 }, { "epoch": 10.468757302425574, - "grad_norm": 0.3410700857639313, - "learning_rate": 0.00015458705864524925, - "loss": 2.162, + "grad_norm": 0.3319580554962158, + "learning_rate": 0.00015458554326413092, + "loss": 2.1616, "step": 224000 }, { "epoch": 10.515492826097116, - "grad_norm": 0.36416828632354736, - "learning_rate": 0.00015382936808607366, - "loss": 2.1618, + "grad_norm": 0.37541308999061584, + "learning_rate": 0.0001538278527049553, + "loss": 2.1611, "step": 225000 }, { "epoch": 10.56222834976866, - "grad_norm": 0.3888450860977173, - "learning_rate": 0.00015307243521745718, - "loss": 2.1703, + "grad_norm": 0.4089110195636749, + "learning_rate": 0.00015307016214577966, + "loss": 2.1702, "step": 226000 }, { "epoch": 10.608963873440201, - "grad_norm": 0.3778453469276428, - "learning_rate": 0.00015231474465828154, - "loss": 2.1659, + "grad_norm": 0.3808048665523529, + "learning_rate": 0.000152312471586604, + "loss": 2.1654, "step": 227000 }, { "epoch": 10.655699397111745, - "grad_norm": 0.3809790313243866, - "learning_rate": 0.00015155781178966508, - "loss": 2.1786, + "grad_norm": 0.35533905029296875, + "learning_rate": 0.00015155478102742837, + "loss": 2.1781, "step": 228000 }, { "epoch": 10.702434920783288, - "grad_norm": 0.35307931900024414, - "learning_rate": 0.00015080012123048944, - "loss": 2.1743, + "grad_norm": 0.35690027475357056, + "learning_rate": 0.00015079860584937108, + "loss": 2.1738, "step": 229000 }, { "epoch": 10.74917044445483, - "grad_norm": 0.34989428520202637, - "learning_rate": 0.0001500424306713138, - "loss": 2.1838, + "grad_norm": 0.35883113741874695, + "learning_rate": 0.0001500409152901955, + "loss": 2.1836, "step": 230000 }, { "epoch": 10.795905968126373, - "grad_norm": 0.372887521982193, - "learning_rate": 0.00014928625549325654, - "loss": 2.1731, + "grad_norm": 0.38277342915534973, + "learning_rate": 0.00014928322473101985, + "loss": 2.173, "step": 231000 }, { "epoch": 10.842641491797915, - "grad_norm": 0.34077638387680054, - "learning_rate": 0.0001485285649340809, - "loss": 2.1707, + "grad_norm": 0.35880765318870544, + "learning_rate": 0.0001485262918624034, + "loss": 2.1704, "step": 232000 }, { "epoch": 10.889377015469458, - "grad_norm": 0.38488736748695374, - "learning_rate": 0.00014777087437490527, - "loss": 2.1761, + "grad_norm": 0.38934314250946045, + "learning_rate": 0.00014776860130322775, + "loss": 2.1755, "step": 233000 }, { "epoch": 10.936112539141002, - "grad_norm": 0.36446672677993774, - "learning_rate": 0.00014701318381572963, - "loss": 2.1727, + "grad_norm": 0.37256234884262085, + "learning_rate": 0.00014701091074405213, + "loss": 2.1725, "step": 234000 }, { "epoch": 10.982848062812543, - "grad_norm": 0.4053245186805725, - "learning_rate": 0.000146255493256554, - "loss": 2.1699, + "grad_norm": 0.4027036428451538, + "learning_rate": 0.00014625397787543565, + "loss": 2.1695, "step": 235000 }, { "epoch": 11.0, - "eval_accuracy": 0.5176925060817075, - "eval_loss": 2.4809539318084717, - "eval_runtime": 183.5125, - "eval_samples_per_second": 383.216, - "eval_steps_per_second": 5.989, + "eval_accuracy": 0.5177156479370988, + "eval_loss": 2.478761911392212, + "eval_runtime": 140.1918, + "eval_samples_per_second": 501.634, + "eval_steps_per_second": 7.839, "step": 235367 }, { "epoch": 11.029583586484087, - "grad_norm": 0.37868165969848633, - "learning_rate": 0.00014549856038793756, - "loss": 2.1413, + "grad_norm": 0.3955394923686981, + "learning_rate": 0.00014549628731626003, + "loss": 2.1411, "step": 236000 }, { "epoch": 11.076319110155628, - "grad_norm": 0.366122841835022, - "learning_rate": 0.00014474086982876192, - "loss": 2.1282, + "grad_norm": 0.38433390855789185, + "learning_rate": 0.0001447385967570844, + "loss": 2.128, "step": 237000 }, { "epoch": 11.123054633827172, - "grad_norm": 0.35669025778770447, - "learning_rate": 0.0001439831792695863, - "loss": 2.1253, + "grad_norm": 0.35297998785972595, + "learning_rate": 0.00014398090619790877, + "loss": 2.125, "step": 238000 }, { "epoch": 11.169790157498715, - "grad_norm": 0.43152889609336853, - "learning_rate": 0.00014322548871041065, - "loss": 2.1358, + "grad_norm": 0.40829989314079285, + "learning_rate": 0.0001432239733292923, + "loss": 2.1349, "step": 239000 }, { "epoch": 11.216525681170257, - "grad_norm": 0.37372422218322754, - "learning_rate": 0.00014246931353235337, - "loss": 2.1377, + "grad_norm": 0.3643953502178192, + "learning_rate": 0.00014246628277011668, + "loss": 2.1368, "step": 240000 }, { "epoch": 11.2632612048418, - "grad_norm": 0.37025725841522217, - "learning_rate": 0.00014171162297317772, - "loss": 2.1344, + "grad_norm": 0.3675245940685272, + "learning_rate": 0.00014170859221094103, + "loss": 2.134, "step": 241000 }, { "epoch": 11.309996728513344, - "grad_norm": 0.37420186400413513, - "learning_rate": 0.0001409539324140021, + "grad_norm": 0.3534744679927826, + "learning_rate": 0.00014095165934232458, "loss": 2.1372, "step": 242000 }, { "epoch": 11.356732252184885, - "grad_norm": 0.4168432652950287, - "learning_rate": 0.00014019699954538565, - "loss": 2.1358, + "grad_norm": 0.4113076627254486, + "learning_rate": 0.00014019396878314896, + "loss": 2.1354, "step": 243000 }, { "epoch": 11.403467775856429, - "grad_norm": 0.38983049988746643, - "learning_rate": 0.00013943930898621, - "loss": 2.1417, + "grad_norm": 0.38102054595947266, + "learning_rate": 0.00013943703591453248, + "loss": 2.1411, "step": 244000 }, { "epoch": 11.45020329952797, - "grad_norm": 0.42163345217704773, - "learning_rate": 0.00013868237611759356, - "loss": 2.1349, + "grad_norm": 0.3844783306121826, + "learning_rate": 0.00013867934535535687, + "loss": 2.1341, "step": 245000 }, { "epoch": 11.496938823199514, - "grad_norm": 0.33661991357803345, - "learning_rate": 0.00013792468555841794, - "loss": 2.1423, + "grad_norm": 0.3427509665489197, + "learning_rate": 0.0001379224124867404, + "loss": 2.1416, "step": 246000 }, { "epoch": 11.543674346871057, - "grad_norm": 0.382899671792984, - "learning_rate": 0.00013716775268980146, - "loss": 2.1383, + "grad_norm": 0.3603721261024475, + "learning_rate": 0.00013716472192756477, + "loss": 2.1376, "step": 247000 }, { "epoch": 11.590409870542599, - "grad_norm": 0.38625696301460266, - "learning_rate": 0.00013641006213062584, - "loss": 2.1489, + "grad_norm": 0.4084089696407318, + "learning_rate": 0.00013640778905894832, + "loss": 2.1488, "step": 248000 }, { "epoch": 11.637145394214143, - "grad_norm": 0.3666653633117676, - "learning_rate": 0.0001356523715714502, - "loss": 2.1569, + "grad_norm": 0.35276925563812256, + "learning_rate": 0.00013565009849977267, + "loss": 2.1566, "step": 249000 }, { "epoch": 11.683880917885684, - "grad_norm": 0.36036914587020874, - "learning_rate": 0.00013489543870283375, - "loss": 2.1537, + "grad_norm": 0.38506028056144714, + "learning_rate": 0.00013489240794059706, + "loss": 2.1526, "step": 250000 }, { "epoch": 11.730616441557228, - "grad_norm": 0.38993409276008606, - "learning_rate": 0.00013413774814365813, - "loss": 2.1538, + "grad_norm": 0.3763647675514221, + "learning_rate": 0.0001341347173814214, + "loss": 2.1532, "step": 251000 }, { "epoch": 11.777351965228771, - "grad_norm": 0.37135306000709534, - "learning_rate": 0.00013338081527504165, - "loss": 2.1466, + "grad_norm": 0.37144818902015686, + "learning_rate": 0.00013337778451280496, + "loss": 2.146, "step": 252000 }, { "epoch": 11.824087488900313, - "grad_norm": 0.37696388363838196, - "learning_rate": 0.00013262312471586603, + "grad_norm": 0.4111426770687103, + "learning_rate": 0.00013262009395362934, "loss": 2.1556, "step": 253000 }, { "epoch": 11.870823012571856, - "grad_norm": 0.40719184279441833, - "learning_rate": 0.00013186543415669042, - "loss": 2.1623, + "grad_norm": 0.3937530517578125, + "learning_rate": 0.00013186316108501286, + "loss": 2.1617, "step": 254000 }, { "epoch": 11.917558536243398, - "grad_norm": 0.3529217541217804, - "learning_rate": 0.00013110774359751477, - "loss": 2.1526, + "grad_norm": 0.3688524067401886, + "learning_rate": 0.00013110547052583722, + "loss": 2.1523, "step": 255000 }, { "epoch": 11.964294059914941, - "grad_norm": 0.38155487179756165, - "learning_rate": 0.0001303508107288983, - "loss": 2.1554, + "grad_norm": 0.3319390118122101, + "learning_rate": 0.0001303485376572208, + "loss": 2.1548, "step": 256000 }, { "epoch": 12.0, - "eval_accuracy": 0.5181830576524957, - "eval_loss": 2.4812095165252686, - "eval_runtime": 183.6323, - "eval_samples_per_second": 382.966, - "eval_steps_per_second": 5.985, + "eval_accuracy": 0.5181523877236639, + "eval_loss": 2.4811060428619385, + "eval_runtime": 139.8944, + "eval_samples_per_second": 502.701, + "eval_steps_per_second": 7.856, "step": 256764 }, { "epoch": 12.011029583586485, - "grad_norm": 0.3908240795135498, - "learning_rate": 0.00012959387786028184, - "loss": 2.1357, + "grad_norm": 0.41884076595306396, + "learning_rate": 0.00012959084709804515, + "loss": 2.1358, "step": 257000 }, { "epoch": 12.057765107258026, - "grad_norm": 0.38744133710861206, - "learning_rate": 0.00012883618730110622, - "loss": 2.1031, + "grad_norm": 0.39449524879455566, + "learning_rate": 0.0001288339142294287, + "loss": 2.1027, "step": 258000 }, { "epoch": 12.10450063092957, - "grad_norm": 0.39171701669692993, - "learning_rate": 0.00012807849674193058, - "loss": 2.1114, + "grad_norm": 0.3929855227470398, + "learning_rate": 0.00012807622367025305, + "loss": 2.111, "step": 259000 }, { "epoch": 12.151236154601113, - "grad_norm": 0.41075876355171204, - "learning_rate": 0.00012732080618275496, - "loss": 2.1134, + "grad_norm": 0.4155491888523102, + "learning_rate": 0.00012731853311107744, + "loss": 2.113, "step": 260000 }, { "epoch": 12.197971678272655, - "grad_norm": 0.4411497712135315, - "learning_rate": 0.0001265638733141385, - "loss": 2.1155, + "grad_norm": 0.40978580713272095, + "learning_rate": 0.0001265608425519018, + "loss": 2.1148, "step": 261000 }, { "epoch": 12.244707201944198, - "grad_norm": 0.3990738093852997, - "learning_rate": 0.00012580618275496286, - "loss": 2.1205, + "grad_norm": 0.3888361155986786, + "learning_rate": 0.00012580390968328534, + "loss": 2.1199, "step": 262000 }, { "epoch": 12.29144272561574, - "grad_norm": 0.4064374566078186, - "learning_rate": 0.00012504924988634639, - "loss": 2.1166, + "grad_norm": 0.4125273823738098, + "learning_rate": 0.0001250462191241097, + "loss": 2.1161, "step": 263000 }, { "epoch": 12.338178249287283, - "grad_norm": 0.40419068932533264, - "learning_rate": 0.00012429155932717077, + "grad_norm": 0.39474251866340637, + "learning_rate": 0.00012428928625549324, "loss": 2.1222, "step": 264000 }, { "epoch": 12.384913772958827, - "grad_norm": 0.37660491466522217, - "learning_rate": 0.00012353462645855432, - "loss": 2.1313, + "grad_norm": 0.3732432425022125, + "learning_rate": 0.00012353159569631762, + "loss": 2.1307, "step": 265000 }, { "epoch": 12.431649296630368, - "grad_norm": 0.4027993381023407, - "learning_rate": 0.00012277693589937867, - "loss": 2.1206, + "grad_norm": 0.43988922238349915, + "learning_rate": 0.00012277390513714198, + "loss": 2.1201, "step": 266000 }, { "epoch": 12.478384820301912, - "grad_norm": 0.39191654324531555, - "learning_rate": 0.00012201924534020305, - "loss": 2.1279, + "grad_norm": 0.406859815120697, + "learning_rate": 0.00012201697226852552, + "loss": 2.1277, "step": 267000 }, { "epoch": 12.525120343973454, - "grad_norm": 0.41823610663414, - "learning_rate": 0.00012126231247158659, - "loss": 2.1291, + "grad_norm": 0.409138560295105, + "learning_rate": 0.0001212592817093499, + "loss": 2.1288, "step": 268000 }, { "epoch": 12.571855867644997, - "grad_norm": 0.3858267664909363, - "learning_rate": 0.00012050462191241096, - "loss": 2.1265, + "grad_norm": 0.3730980455875397, + "learning_rate": 0.00012050159115017425, + "loss": 2.126, "step": 269000 }, { "epoch": 12.61859139131654, - "grad_norm": 0.4283806383609772, - "learning_rate": 0.00011974693135323533, - "loss": 2.1296, + "grad_norm": 0.4209441542625427, + "learning_rate": 0.0001197446582815578, + "loss": 2.1291, "step": 270000 }, { "epoch": 12.665326914988082, - "grad_norm": 0.36854031682014465, - "learning_rate": 0.0001189892407940597, - "loss": 2.1351, + "grad_norm": 0.35842010378837585, + "learning_rate": 0.00011898696772238217, + "loss": 2.135, "step": 271000 }, { "epoch": 12.712062438659625, - "grad_norm": 0.3683795928955078, - "learning_rate": 0.00011823230792544323, - "loss": 2.1203, + "grad_norm": 0.37030935287475586, + "learning_rate": 0.00011822927716320654, + "loss": 2.1202, "step": 272000 }, { "epoch": 12.758797962331167, - "grad_norm": 0.40869027376174927, - "learning_rate": 0.00011747461736626761, - "loss": 2.1343, + "grad_norm": 0.4215924143791199, + "learning_rate": 0.0001174715866040309, + "loss": 2.1339, "step": 273000 }, { "epoch": 12.80553348600271, - "grad_norm": 0.3861792981624603, - "learning_rate": 0.00011671768449765115, - "loss": 2.1327, + "grad_norm": 0.40709611773490906, + "learning_rate": 0.00011671541142597361, + "loss": 2.1321, "step": 274000 }, { "epoch": 12.852269009674254, - "grad_norm": 0.3687632381916046, - "learning_rate": 0.00011595999393847552, - "loss": 2.1362, + "grad_norm": 0.372782438993454, + "learning_rate": 0.00011595772086679799, + "loss": 2.1355, "step": 275000 }, { "epoch": 12.899004533345796, - "grad_norm": 0.40457507967948914, - "learning_rate": 0.00011520230337929988, - "loss": 2.1375, + "grad_norm": 0.40553081035614014, + "learning_rate": 0.00011520003030762236, + "loss": 2.1371, "step": 276000 }, { "epoch": 12.945740057017339, - "grad_norm": 0.37788259983062744, - "learning_rate": 0.00011444537051068342, - "loss": 2.1384, + "grad_norm": 0.37446466088294983, + "learning_rate": 0.0001144430974390059, + "loss": 2.1375, "step": 277000 }, { "epoch": 12.992475580688883, - "grad_norm": 0.3549434542655945, - "learning_rate": 0.00011368767995150779, - "loss": 2.131, + "grad_norm": 0.355365127325058, + "learning_rate": 0.00011368540687983028, + "loss": 2.1307, "step": 278000 }, { "epoch": 13.0, - "eval_accuracy": 0.5186614527787653, - "eval_loss": 2.4790453910827637, - "eval_runtime": 183.9494, - "eval_samples_per_second": 382.306, - "eval_steps_per_second": 5.974, + "eval_accuracy": 0.5186005590291574, + "eval_loss": 2.4787793159484863, + "eval_runtime": 140.1152, + "eval_samples_per_second": 501.908, + "eval_steps_per_second": 7.844, "step": 278161 }, { "epoch": 13.039211104360424, - "grad_norm": 0.38944530487060547, - "learning_rate": 0.00011292998939233217, - "loss": 2.0927, + "grad_norm": 0.38456571102142334, + "learning_rate": 0.00011292847401121381, + "loss": 2.0919, "step": 279000 }, { "epoch": 13.085946628031968, - "grad_norm": 0.3954192101955414, - "learning_rate": 0.0001121730565237157, - "loss": 2.0879, + "grad_norm": 0.36770784854888916, + "learning_rate": 0.00011217078345203817, + "loss": 2.0873, "step": 280000 }, { "epoch": 13.13268215170351, - "grad_norm": 0.39661821722984314, - "learning_rate": 0.00011141536596454006, - "loss": 2.0967, + "grad_norm": 0.3647533357143402, + "learning_rate": 0.00011141309289286255, + "loss": 2.0963, "step": 281000 }, { "epoch": 13.179417675375053, - "grad_norm": 0.4078295826911926, - "learning_rate": 0.00011065767540536444, - "loss": 2.0992, + "grad_norm": 0.39681529998779297, + "learning_rate": 0.00011065616002424608, + "loss": 2.0991, "step": 282000 }, { "epoch": 13.226153199046596, - "grad_norm": 0.41279336810112, - "learning_rate": 0.00010990074253674798, - "loss": 2.1065, + "grad_norm": 0.44981902837753296, + "learning_rate": 0.00010989846946507045, + "loss": 2.1062, "step": 283000 }, { "epoch": 13.272888722718138, - "grad_norm": 0.4141485095024109, - "learning_rate": 0.00010914305197757235, - "loss": 2.101, + "grad_norm": 0.40634024143218994, + "learning_rate": 0.00010914077890589482, + "loss": 2.1009, "step": 284000 }, { "epoch": 13.319624246389681, - "grad_norm": 0.37772706151008606, - "learning_rate": 0.00010838536141839672, - "loss": 2.1033, + "grad_norm": 0.37954410910606384, + "learning_rate": 0.00010838384603727837, + "loss": 2.1022, "step": 285000 }, { "epoch": 13.366359770061223, - "grad_norm": 0.42048418521881104, - "learning_rate": 0.00010762842854978026, - "loss": 2.1037, + "grad_norm": 0.4176303446292877, + "learning_rate": 0.00010762615547810273, + "loss": 2.1035, "step": 286000 }, { "epoch": 13.413095293732766, - "grad_norm": 0.3956892192363739, - "learning_rate": 0.00010687073799060462, - "loss": 2.1035, + "grad_norm": 0.398928701877594, + "learning_rate": 0.00010686922260948629, + "loss": 2.103, "step": 287000 }, { "epoch": 13.45983081740431, - "grad_norm": 0.40276169776916504, - "learning_rate": 0.000106113047431429, - "loss": 2.1035, + "grad_norm": 0.4115624725818634, + "learning_rate": 0.00010611153205031064, + "loss": 2.1032, "step": 288000 }, { "epoch": 13.506566341075851, - "grad_norm": 0.37681591510772705, - "learning_rate": 0.00010535611456281254, - "loss": 2.1085, + "grad_norm": 0.40450263023376465, + "learning_rate": 0.00010535459918169418, + "loss": 2.1083, "step": 289000 }, { "epoch": 13.553301864747395, - "grad_norm": 0.4368800222873688, - "learning_rate": 0.0001045984240036369, - "loss": 2.1182, + "grad_norm": 0.44425278902053833, + "learning_rate": 0.00010459690862251856, + "loss": 2.1176, "step": 290000 }, { "epoch": 13.600037388418937, - "grad_norm": 0.3877386152744293, - "learning_rate": 0.00010384073344446127, - "loss": 2.1126, + "grad_norm": 0.3785880506038666, + "learning_rate": 0.0001038399757539021, + "loss": 2.1124, "step": 291000 }, { "epoch": 13.64677291209048, - "grad_norm": 0.3556603491306305, - "learning_rate": 0.00010308380057584481, - "loss": 2.1111, + "grad_norm": 0.3684954047203064, + "learning_rate": 0.00010308228519472646, + "loss": 2.1108, "step": 292000 }, { "epoch": 13.693508435762023, - "grad_norm": 0.40014851093292236, - "learning_rate": 0.00010232611001666918, - "loss": 2.1132, + "grad_norm": 0.3949037194252014, + "learning_rate": 0.00010232459463555083, + "loss": 2.1129, "step": 293000 }, { "epoch": 13.740243959433565, - "grad_norm": 0.4012806713581085, - "learning_rate": 0.00010156917714805273, - "loss": 2.1129, + "grad_norm": 0.41194283962249756, + "learning_rate": 0.0001015669040763752, + "loss": 2.1126, "step": 294000 }, { "epoch": 13.786979483105108, - "grad_norm": 0.42094719409942627, - "learning_rate": 0.0001008114865888771, - "loss": 2.1169, + "grad_norm": 0.4066857397556305, + "learning_rate": 0.00010080921351719956, + "loss": 2.1167, "step": 295000 }, { "epoch": 13.83371500677665, - "grad_norm": 0.40798112750053406, - "learning_rate": 0.00010005379602970145, + "grad_norm": 0.4021887481212616, + "learning_rate": 0.00010005228064858312, "loss": 2.1097, "step": 296000 }, { "epoch": 13.880450530448194, - "grad_norm": 0.3869474530220032, - "learning_rate": 9.929686316108501e-05, - "loss": 2.1168, + "grad_norm": 0.3883073329925537, + "learning_rate": 9.929459008940747e-05, + "loss": 2.1163, "step": 297000 }, { "epoch": 13.927186054119737, - "grad_norm": 0.4175203740596771, - "learning_rate": 9.853917260190937e-05, - "loss": 2.1117, + "grad_norm": 0.41835400462150574, + "learning_rate": 9.853841491135019e-05, + "loss": 2.1113, "step": 298000 }, { "epoch": 13.973921577791279, - "grad_norm": 0.36646854877471924, - "learning_rate": 9.778223973329292e-05, - "loss": 2.1231, + "grad_norm": 0.378459632396698, + "learning_rate": 9.778072435217456e-05, + "loss": 2.1228, "step": 299000 }, { "epoch": 14.0, - "eval_accuracy": 0.5187720318130807, - "eval_loss": 2.481027841567993, - "eval_runtime": 184.1596, - "eval_samples_per_second": 381.87, - "eval_steps_per_second": 5.968, + "eval_accuracy": 0.5187517896600518, + "eval_loss": 2.480222225189209, + "eval_runtime": 139.9054, + "eval_samples_per_second": 502.661, + "eval_steps_per_second": 7.855, "step": 299558 }, { "epoch": 14.020657101462822, - "grad_norm": 0.3873392343521118, - "learning_rate": 9.702530686467645e-05, - "loss": 2.1009, + "grad_norm": 0.40461719036102295, + "learning_rate": 9.702303379299894e-05, + "loss": 2.1005, "step": 300000 }, { "epoch": 14.067392625134366, - "grad_norm": 0.4304318428039551, - "learning_rate": 9.626761630550083e-05, - "loss": 2.0792, + "grad_norm": 0.44079822301864624, + "learning_rate": 9.626610092438247e-05, + "loss": 2.0786, "step": 301000 }, { "epoch": 14.114128148805907, - "grad_norm": 0.36497995257377625, - "learning_rate": 9.550992574632519e-05, - "loss": 2.0768, + "grad_norm": 0.40090832114219666, + "learning_rate": 9.550841036520683e-05, + "loss": 2.0766, "step": 302000 }, { "epoch": 14.16086367247745, - "grad_norm": 0.38602423667907715, - "learning_rate": 9.475223518714957e-05, - "loss": 2.0741, + "grad_norm": 0.375640869140625, + "learning_rate": 9.475071980603121e-05, + "loss": 2.0735, "step": 303000 }, { "epoch": 14.207599196148992, - "grad_norm": 0.4310854375362396, - "learning_rate": 9.399454462797393e-05, - "loss": 2.078, + "grad_norm": 0.44055065512657166, + "learning_rate": 9.399302924685557e-05, + "loss": 2.0775, "step": 304000 }, { "epoch": 14.254334719820536, - "grad_norm": 0.3950233459472656, - "learning_rate": 9.323761175935746e-05, - "loss": 2.0915, + "grad_norm": 0.4058537781238556, + "learning_rate": 9.323609637823912e-05, + "loss": 2.0912, "step": 305000 }, { "epoch": 14.30107024349208, - "grad_norm": 0.40887415409088135, - "learning_rate": 9.247992120018184e-05, - "loss": 2.0884, + "grad_norm": 0.3981979191303253, + "learning_rate": 9.247840581906348e-05, + "loss": 2.0874, "step": 306000 }, { "epoch": 14.34780576716362, - "grad_norm": 0.4131264388561249, - "learning_rate": 9.172298833156538e-05, - "loss": 2.0976, + "grad_norm": 0.40232914686203003, + "learning_rate": 9.172071525988785e-05, + "loss": 2.0974, "step": 307000 }, { "epoch": 14.394541290835164, - "grad_norm": 0.41139036417007446, - "learning_rate": 9.096529777238975e-05, - "loss": 2.0886, + "grad_norm": 0.4231472313404083, + "learning_rate": 9.096378239127139e-05, + "loss": 2.0876, "step": 308000 }, { "epoch": 14.441276814506706, - "grad_norm": 0.44874390959739685, - "learning_rate": 9.020760721321412e-05, - "loss": 2.0889, + "grad_norm": 0.43963202834129333, + "learning_rate": 9.020609183209577e-05, + "loss": 2.0878, "step": 309000 }, { "epoch": 14.48801233817825, - "grad_norm": 0.38657814264297485, - "learning_rate": 8.944991665403849e-05, - "loss": 2.0933, + "grad_norm": 0.37431785464286804, + "learning_rate": 8.94491589634793e-05, + "loss": 2.0928, "step": 310000 }, { "epoch": 14.534747861849793, - "grad_norm": 0.3783547580242157, - "learning_rate": 8.869222609486284e-05, - "loss": 2.0852, + "grad_norm": 0.39257049560546875, + "learning_rate": 8.869146840430366e-05, + "loss": 2.0847, "step": 311000 }, { "epoch": 14.581483385521334, - "grad_norm": 0.35217922925949097, - "learning_rate": 8.79352932262464e-05, - "loss": 2.0961, + "grad_norm": 0.363940954208374, + "learning_rate": 8.793453553568722e-05, + "loss": 2.096, "step": 312000 }, { "epoch": 14.628218909192878, - "grad_norm": 0.3531019389629364, - "learning_rate": 8.717760266707076e-05, - "loss": 2.0972, + "grad_norm": 0.3870943784713745, + "learning_rate": 8.717684497651158e-05, + "loss": 2.0969, "step": 313000 }, { "epoch": 14.67495443286442, - "grad_norm": 0.43090537190437317, - "learning_rate": 8.64206697984543e-05, - "loss": 2.0943, + "grad_norm": 0.44007956981658936, + "learning_rate": 8.641991210789513e-05, + "loss": 2.0938, "step": 314000 }, { "epoch": 14.721689956535963, - "grad_norm": 0.41564884781837463, - "learning_rate": 8.566297923927867e-05, - "loss": 2.0997, + "grad_norm": 0.4328775703907013, + "learning_rate": 8.56622215487195e-05, + "loss": 2.0994, "step": 315000 }, { "epoch": 14.768425480207506, - "grad_norm": 0.4131323993206024, - "learning_rate": 8.490604637066222e-05, - "loss": 2.0906, + "grad_norm": 0.4116855561733246, + "learning_rate": 8.490453098954386e-05, + "loss": 2.09, "step": 316000 }, { "epoch": 14.815161003879048, - "grad_norm": 0.40813466906547546, - "learning_rate": 8.414835581148658e-05, - "loss": 2.103, + "grad_norm": 0.4201270341873169, + "learning_rate": 8.414684043036822e-05, + "loss": 2.1029, "step": 317000 }, { "epoch": 14.861896527550591, - "grad_norm": 0.4064253866672516, - "learning_rate": 8.339142294287011e-05, - "loss": 2.1013, + "grad_norm": 0.4016938805580139, + "learning_rate": 8.338990756175178e-05, + "loss": 2.1011, "step": 318000 }, { "epoch": 14.908632051222135, - "grad_norm": 0.4227573275566101, - "learning_rate": 8.26337323836945e-05, - "loss": 2.0943, + "grad_norm": 0.4174578785896301, + "learning_rate": 8.263221700257614e-05, + "loss": 2.0933, "step": 319000 }, { "epoch": 14.955367574893677, - "grad_norm": 0.4152744710445404, - "learning_rate": 8.187679951507803e-05, - "loss": 2.0991, + "grad_norm": 0.4125044345855713, + "learning_rate": 8.187528413395969e-05, + "loss": 2.0984, "step": 320000 }, { "epoch": 15.0, - "eval_accuracy": 0.5190574852052445, - "eval_loss": 2.4825658798217773, - "eval_runtime": 183.7288, - "eval_samples_per_second": 382.765, - "eval_steps_per_second": 5.982, + "eval_accuracy": 0.5190152164669636, + "eval_loss": 2.4807238578796387, + "eval_runtime": 139.9616, + "eval_samples_per_second": 502.459, + "eval_steps_per_second": 7.852, "step": 320955 }, { "epoch": 15.00210309856522, - "grad_norm": 0.39815834164619446, - "learning_rate": 8.11191089559024e-05, - "loss": 2.0967, + "grad_norm": 0.3719761073589325, + "learning_rate": 8.111759357478405e-05, + "loss": 2.0962, "step": 321000 }, { "epoch": 15.048838622236762, - "grad_norm": 0.39639827609062195, - "learning_rate": 8.036217608728595e-05, - "loss": 2.0557, + "grad_norm": 0.38452646136283875, + "learning_rate": 8.035990301560842e-05, + "loss": 2.0553, "step": 322000 }, { "epoch": 15.095574145908305, - "grad_norm": 0.4057682752609253, - "learning_rate": 7.960448552811032e-05, - "loss": 2.0602, + "grad_norm": 0.4220745265483856, + "learning_rate": 7.960221245643278e-05, + "loss": 2.0599, "step": 323000 }, { "epoch": 15.142309669579848, - "grad_norm": 0.40997353196144104, - "learning_rate": 7.884679496893467e-05, - "loss": 2.063, + "grad_norm": 0.4033695459365845, + "learning_rate": 7.884527958781634e-05, + "loss": 2.0626, "step": 324000 }, { "epoch": 15.18904519325139, - "grad_norm": 0.3955094814300537, - "learning_rate": 7.808910440975905e-05, - "loss": 2.0665, + "grad_norm": 0.43598106503486633, + "learning_rate": 7.80875890286407e-05, + "loss": 2.0659, "step": 325000 }, { "epoch": 15.235780716922934, - "grad_norm": 0.43070030212402344, - "learning_rate": 7.733141385058341e-05, - "loss": 2.0696, + "grad_norm": 0.41840988397598267, + "learning_rate": 7.733065616002423e-05, + "loss": 2.0691, "step": 326000 }, { "epoch": 15.282516240594475, - "grad_norm": 0.42902061343193054, - "learning_rate": 7.657523867252612e-05, - "loss": 2.0727, + "grad_norm": 0.42530229687690735, + "learning_rate": 7.657296560084861e-05, + "loss": 2.0719, "step": 327000 }, { "epoch": 15.329251764266019, - "grad_norm": 0.42950913310050964, - "learning_rate": 7.58175481133505e-05, - "loss": 2.0748, + "grad_norm": 0.4081062078475952, + "learning_rate": 7.581527504167297e-05, + "loss": 2.0744, "step": 328000 }, { "epoch": 15.375987287937562, - "grad_norm": 0.4149567782878876, - "learning_rate": 7.505985755417486e-05, - "loss": 2.0752, + "grad_norm": 0.4081234335899353, + "learning_rate": 7.505834217305652e-05, + "loss": 2.0745, "step": 329000 }, { "epoch": 15.422722811609104, - "grad_norm": 0.4134847819805145, - "learning_rate": 7.430216699499924e-05, - "loss": 2.0752, + "grad_norm": 0.4203273355960846, + "learning_rate": 7.430065161388089e-05, + "loss": 2.0743, "step": 330000 }, { "epoch": 15.469458335280647, - "grad_norm": 0.3985021412372589, - "learning_rate": 7.354447643582361e-05, - "loss": 2.0776, + "grad_norm": 0.400919109582901, + "learning_rate": 7.354296105470525e-05, + "loss": 2.077, "step": 331000 }, { "epoch": 15.516193858952189, - "grad_norm": 0.4318609833717346, - "learning_rate": 7.278754356720715e-05, - "loss": 2.0797, + "grad_norm": 0.4510698616504669, + "learning_rate": 7.27860281860888e-05, + "loss": 2.0785, "step": 332000 }, { "epoch": 15.562929382623732, - "grad_norm": 0.4110203683376312, - "learning_rate": 7.202985300803152e-05, - "loss": 2.0811, + "grad_norm": 0.43255168199539185, + "learning_rate": 7.202833762691316e-05, + "loss": 2.0804, "step": 333000 }, { "epoch": 15.609664906295276, - "grad_norm": 0.423392653465271, - "learning_rate": 7.127292013941505e-05, - "loss": 2.0744, + "grad_norm": 0.4247725307941437, + "learning_rate": 7.12714047582967e-05, + "loss": 2.0743, "step": 334000 }, { "epoch": 15.656400429966817, - "grad_norm": 0.40932226181030273, - "learning_rate": 7.051522958023942e-05, - "loss": 2.0779, + "grad_norm": 0.4146491587162018, + "learning_rate": 7.051371419912107e-05, + "loss": 2.0774, "step": 335000 }, { "epoch": 15.70313595363836, - "grad_norm": 0.42987844347953796, - "learning_rate": 6.975829671162297e-05, - "loss": 2.0712, + "grad_norm": 0.4102727770805359, + "learning_rate": 6.975602363994544e-05, + "loss": 2.0708, "step": 336000 }, { "epoch": 15.749871477309902, - "grad_norm": 0.42654335498809814, - "learning_rate": 6.900060615244734e-05, - "loss": 2.076, + "grad_norm": 0.43857139348983765, + "learning_rate": 6.899909077132898e-05, + "loss": 2.0755, "step": 337000 }, { "epoch": 15.796607000981446, - "grad_norm": 0.45028138160705566, - "learning_rate": 6.82429155932717e-05, - "loss": 2.08, + "grad_norm": 0.4425510764122009, + "learning_rate": 6.824140021215335e-05, + "loss": 2.0797, "step": 338000 }, { "epoch": 15.84334252465299, - "grad_norm": 0.43925535678863525, - "learning_rate": 6.748598272465524e-05, - "loss": 2.0885, + "grad_norm": 0.40479570627212524, + "learning_rate": 6.74844673435369e-05, + "loss": 2.0879, "step": 339000 }, { "epoch": 15.890078048324531, - "grad_norm": 0.4209968149662018, - "learning_rate": 6.672904985603879e-05, - "loss": 2.0859, + "grad_norm": 0.4146506190299988, + "learning_rate": 6.672677678436126e-05, + "loss": 2.0858, "step": 340000 }, { "epoch": 15.936813571996074, - "grad_norm": 0.3998005986213684, - "learning_rate": 6.597135929686316e-05, - "loss": 2.0839, + "grad_norm": 0.3799484670162201, + "learning_rate": 6.596908622518562e-05, + "loss": 2.0837, "step": 341000 }, { "epoch": 15.983549095667616, - "grad_norm": 0.3756837844848633, - "learning_rate": 6.521366873768751e-05, - "loss": 2.0851, + "grad_norm": 0.35762977600097656, + "learning_rate": 6.521139566600999e-05, + "loss": 2.0845, "step": 342000 }, { "epoch": 16.0, - "eval_accuracy": 0.519193548162244, - "eval_loss": 2.4821929931640625, - "eval_runtime": 183.7167, - "eval_samples_per_second": 382.79, - "eval_steps_per_second": 5.982, + "eval_accuracy": 0.5191752019684518, + "eval_loss": 2.4827961921691895, + "eval_runtime": 140.201, + "eval_samples_per_second": 501.601, + "eval_steps_per_second": 7.839, "step": 342352 }, { "epoch": 16.03028461933916, - "grad_norm": 0.46294063329696655, - "learning_rate": 6.445597817851188e-05, - "loss": 2.0601, + "grad_norm": 0.47773468494415283, + "learning_rate": 6.445446279739354e-05, + "loss": 2.0599, "step": 343000 }, { "epoch": 16.077020143010703, - "grad_norm": 0.441850483417511, - "learning_rate": 6.369904530989543e-05, - "loss": 2.0523, + "grad_norm": 0.4322941303253174, + "learning_rate": 6.36967722382179e-05, + "loss": 2.0511, "step": 344000 }, { "epoch": 16.123755666682246, - "grad_norm": 0.468127578496933, - "learning_rate": 6.29413547507198e-05, - "loss": 2.0485, + "grad_norm": 0.4478399157524109, + "learning_rate": 6.293983936960145e-05, + "loss": 2.0478, "step": 345000 }, { "epoch": 16.170491190353786, - "grad_norm": 0.41471847891807556, - "learning_rate": 6.218366419154417e-05, - "loss": 2.0433, + "grad_norm": 0.4218432903289795, + "learning_rate": 6.218214881042582e-05, + "loss": 2.0431, "step": 346000 }, { "epoch": 16.21722671402533, - "grad_norm": 0.4540562927722931, - "learning_rate": 6.142673132292772e-05, - "loss": 2.0585, + "grad_norm": 0.46387240290641785, + "learning_rate": 6.142445825125018e-05, + "loss": 2.058, "step": 347000 }, { "epoch": 16.263962237696873, - "grad_norm": 0.4675719738006592, - "learning_rate": 6.066904076375207e-05, - "loss": 2.0585, + "grad_norm": 0.4464201033115387, + "learning_rate": 6.066752538263373e-05, + "loss": 2.0578, "step": 348000 }, { "epoch": 16.310697761368417, - "grad_norm": 0.43343010544776917, - "learning_rate": 5.991135020457644e-05, - "loss": 2.0519, + "grad_norm": 0.4614529013633728, + "learning_rate": 5.9909834823458096e-05, + "loss": 2.0514, "step": 349000 }, { "epoch": 16.35743328503996, - "grad_norm": 0.4093555808067322, - "learning_rate": 5.915365964540082e-05, - "loss": 2.0601, + "grad_norm": 0.4258309006690979, + "learning_rate": 5.915214426428246e-05, + "loss": 2.059, "step": 350000 }, { "epoch": 16.404168808711503, - "grad_norm": 0.4201803505420685, - "learning_rate": 5.8395969086225186e-05, + "grad_norm": 0.4466577172279358, + "learning_rate": 5.8395211395666006e-05, "loss": 2.059, "step": 351000 }, { "epoch": 16.450904332383043, - "grad_norm": 0.4299643337726593, - "learning_rate": 5.763903621760872e-05, - "loss": 2.0602, + "grad_norm": 0.43303975462913513, + "learning_rate": 5.7637520836490375e-05, + "loss": 2.0594, "step": 352000 }, { "epoch": 16.497639856054587, - "grad_norm": 0.42913755774497986, - "learning_rate": 5.688134565843309e-05, - "loss": 2.0529, + "grad_norm": 0.43263858556747437, + "learning_rate": 5.688058796787392e-05, + "loss": 2.0521, "step": 353000 }, { "epoch": 16.54437537972613, - "grad_norm": 0.39006081223487854, - "learning_rate": 5.612441278981663e-05, - "loss": 2.0703, + "grad_norm": 0.3774734139442444, + "learning_rate": 5.6122897408698286e-05, + "loss": 2.07, "step": 354000 }, { "epoch": 16.591110903397674, - "grad_norm": 0.4356352388858795, - "learning_rate": 5.536747992120018e-05, - "loss": 2.0618, + "grad_norm": 0.4208875000476837, + "learning_rate": 5.536596454008182e-05, + "loss": 2.0611, "step": 355000 }, { "epoch": 16.637846427069217, - "grad_norm": 0.4199896454811096, - "learning_rate": 5.4610547052583714e-05, - "loss": 2.0674, + "grad_norm": 0.42913100123405457, + "learning_rate": 5.460827398090619e-05, + "loss": 2.0665, "step": 356000 }, { "epoch": 16.684581950740757, - "grad_norm": 0.4299008846282959, - "learning_rate": 5.385285649340809e-05, - "loss": 2.0648, + "grad_norm": 0.43968814611434937, + "learning_rate": 5.3850583421730565e-05, + "loss": 2.0642, "step": 357000 }, { "epoch": 16.7313174744123, - "grad_norm": 0.4732346832752228, - "learning_rate": 5.309516593423246e-05, - "loss": 2.0708, + "grad_norm": 0.44484037160873413, + "learning_rate": 5.3092892862554934e-05, + "loss": 2.0699, "step": 358000 }, { "epoch": 16.778052998083844, - "grad_norm": 0.45810797810554504, - "learning_rate": 5.233747537505683e-05, - "loss": 2.0599, + "grad_norm": 0.44043654203414917, + "learning_rate": 5.233595999393847e-05, + "loss": 2.0594, "step": 359000 }, { "epoch": 16.824788521755387, - "grad_norm": 0.42283934354782104, - "learning_rate": 5.157978481588119e-05, - "loss": 2.0613, + "grad_norm": 0.4146668612957001, + "learning_rate": 5.157902712532202e-05, + "loss": 2.0609, "step": 360000 }, { "epoch": 16.87152404542693, - "grad_norm": 0.44432106614112854, - "learning_rate": 5.082209425670556e-05, - "loss": 2.0623, + "grad_norm": 0.4560825228691101, + "learning_rate": 5.082133656614638e-05, + "loss": 2.0619, "step": 361000 }, { "epoch": 16.91825956909847, - "grad_norm": 0.431477814912796, - "learning_rate": 5.006591907864827e-05, - "loss": 2.0676, + "grad_norm": 0.41832563281059265, + "learning_rate": 5.006364600697075e-05, + "loss": 2.067, "step": 362000 }, { "epoch": 16.964995092770014, - "grad_norm": 0.40297403931617737, - "learning_rate": 4.930822851947264e-05, - "loss": 2.0694, + "grad_norm": 0.4294453263282776, + "learning_rate": 4.9306713138354296e-05, + "loss": 2.0687, "step": 363000 }, { "epoch": 17.0, - "eval_accuracy": 0.519240556798617, - "eval_loss": 2.4870810508728027, - "eval_runtime": 184.4531, - "eval_samples_per_second": 381.262, - "eval_steps_per_second": 5.958, + "eval_accuracy": 0.5193157260060085, + "eval_loss": 2.484442949295044, + "eval_runtime": 139.8557, + "eval_samples_per_second": 502.84, + "eval_steps_per_second": 7.858, "step": 363749 }, { "epoch": 17.011730616441557, - "grad_norm": 0.42919251322746277, - "learning_rate": 4.855053796029701e-05, - "loss": 2.0591, + "grad_norm": 0.4160861670970917, + "learning_rate": 4.854902257917866e-05, + "loss": 2.0586, "step": 364000 }, { "epoch": 17.0584661401131, - "grad_norm": 0.4822104573249817, - "learning_rate": 4.779284740112138e-05, - "loss": 2.0356, + "grad_norm": 0.4748011827468872, + "learning_rate": 4.779208971056221e-05, + "loss": 2.0348, "step": 365000 }, { "epoch": 17.105201663784644, - "grad_norm": 0.44094109535217285, - "learning_rate": 4.703591453250492e-05, - "loss": 2.0376, + "grad_norm": 0.43394219875335693, + "learning_rate": 4.7034399151386575e-05, + "loss": 2.0372, "step": 366000 }, { "epoch": 17.151937187456184, - "grad_norm": 0.3879323899745941, - "learning_rate": 4.627822397332929e-05, - "loss": 2.0357, + "grad_norm": 0.42335084080696106, + "learning_rate": 4.627746628277011e-05, + "loss": 2.0354, "step": 367000 }, { "epoch": 17.198672711127728, - "grad_norm": 0.4346221387386322, + "grad_norm": 0.4134625792503357, "learning_rate": 4.552053341415366e-05, - "loss": 2.0424, + "loss": 2.042, "step": 368000 }, { "epoch": 17.24540823479927, - "grad_norm": 0.42520612478256226, + "grad_norm": 0.4275600016117096, "learning_rate": 4.476284285497802e-05, - "loss": 2.0422, + "loss": 2.0418, "step": 369000 }, { "epoch": 17.292143758470814, - "grad_norm": 0.3917822241783142, + "grad_norm": 0.39555883407592773, "learning_rate": 4.400515229580239e-05, - "loss": 2.0312, + "loss": 2.0306, "step": 370000 }, { "epoch": 17.338879282142358, - "grad_norm": 0.4468395411968231, - "learning_rate": 4.3248977117745104e-05, - "loss": 2.0453, + "grad_norm": 0.4214068055152893, + "learning_rate": 4.324746173662676e-05, + "loss": 2.0448, "step": 371000 }, { "epoch": 17.385614805813898, - "grad_norm": 0.4312225580215454, - "learning_rate": 4.249128655856948e-05, - "loss": 2.0439, + "grad_norm": 0.47871068120002747, + "learning_rate": 4.24905288680103e-05, + "loss": 2.0431, "step": 372000 }, { "epoch": 17.43235032948544, - "grad_norm": 0.4082895815372467, - "learning_rate": 4.173359599939385e-05, - "loss": 2.0425, + "grad_norm": 0.412401407957077, + "learning_rate": 4.173283830883467e-05, + "loss": 2.0419, "step": 373000 }, { "epoch": 17.479085853156985, - "grad_norm": 0.4173845052719116, - "learning_rate": 4.097590544021822e-05, - "loss": 2.0478, + "grad_norm": 0.4188598096370697, + "learning_rate": 4.097514774965904e-05, + "loss": 2.0472, "step": 374000 }, { "epoch": 17.525821376828528, - "grad_norm": 0.47007617354393005, - "learning_rate": 4.021897257160175e-05, - "loss": 2.0374, + "grad_norm": 0.4559321403503418, + "learning_rate": 4.021821488104258e-05, + "loss": 2.0371, "step": 375000 }, { "epoch": 17.57255690050007, - "grad_norm": 0.45107054710388184, - "learning_rate": 3.94620397029853e-05, - "loss": 2.0552, + "grad_norm": 0.43533673882484436, + "learning_rate": 3.946128201242612e-05, + "loss": 2.0546, "step": 376000 }, { "epoch": 17.61929242417161, - "grad_norm": 0.43024107813835144, - "learning_rate": 3.870510683436884e-05, - "loss": 2.0489, + "grad_norm": 0.43328747153282166, + "learning_rate": 3.870359145325049e-05, + "loss": 2.0478, "step": 377000 }, { "epoch": 17.666027947843155, - "grad_norm": 0.40413206815719604, - "learning_rate": 3.794741627519321e-05, - "loss": 2.0532, + "grad_norm": 0.42844444513320923, + "learning_rate": 3.794590089407485e-05, + "loss": 2.0524, "step": 378000 }, { "epoch": 17.712763471514698, - "grad_norm": 0.4514461159706116, - "learning_rate": 3.718972571601757e-05, - "loss": 2.0473, + "grad_norm": 0.44369402527809143, + "learning_rate": 3.718821033489922e-05, + "loss": 2.0466, "step": 379000 }, { "epoch": 17.75949899518624, - "grad_norm": 0.44332849979400635, - "learning_rate": 3.643203515684194e-05, - "loss": 2.053, + "grad_norm": 0.48059558868408203, + "learning_rate": 3.6430519775723596e-05, + "loss": 2.0529, "step": 380000 }, { "epoch": 17.806234518857785, - "grad_norm": 0.431598424911499, - "learning_rate": 3.567434459766631e-05, - "loss": 2.048, + "grad_norm": 0.4353107511997223, + "learning_rate": 3.567282921654796e-05, + "loss": 2.047, "step": 381000 }, { "epoch": 17.852970042529325, - "grad_norm": 0.4604801833629608, - "learning_rate": 3.491741172904985e-05, - "loss": 2.0541, + "grad_norm": 0.46026790142059326, + "learning_rate": 3.49158963479315e-05, + "loss": 2.0536, "step": 382000 }, { "epoch": 17.89970556620087, - "grad_norm": 0.4291286766529083, + "grad_norm": 0.4381290674209595, "learning_rate": 3.415972116987422e-05, - "loss": 2.0486, + "loss": 2.0475, "step": 383000 }, { "epoch": 17.946441089872412, - "grad_norm": 0.46012356877326965, + "grad_norm": 0.43139737844467163, "learning_rate": 3.340203061069859e-05, - "loss": 2.0515, + "loss": 2.0513, "step": 384000 }, { "epoch": 17.993176613543955, - "grad_norm": 0.44041863083839417, - "learning_rate": 3.264509774208213e-05, - "loss": 2.0582, + "grad_norm": 0.4275793731212616, + "learning_rate": 3.264434005152295e-05, + "loss": 2.0578, "step": 385000 }, { "epoch": 18.0, - "eval_accuracy": 0.5194157657375073, - "eval_loss": 2.489042043685913, - "eval_runtime": 184.3996, - "eval_samples_per_second": 381.373, - "eval_steps_per_second": 5.96, + "eval_accuracy": 0.5193356335779957, + "eval_loss": 2.4892284870147705, + "eval_runtime": 140.4232, + "eval_samples_per_second": 500.808, + "eval_steps_per_second": 7.826, "step": 385146 }, { "epoch": 18.0399121372155, - "grad_norm": 0.4347294270992279, - "learning_rate": 3.1887407182906494e-05, - "loss": 2.026, + "grad_norm": 0.43875226378440857, + "learning_rate": 3.188664949234732e-05, + "loss": 2.025, "step": 386000 }, { "epoch": 18.08664766088704, - "grad_norm": 0.4946117699146271, - "learning_rate": 3.113047431429004e-05, - "loss": 2.0199, + "grad_norm": 0.5065541863441467, + "learning_rate": 3.112895893317169e-05, + "loss": 2.0194, "step": 387000 }, { "epoch": 18.133383184558582, - "grad_norm": 0.4774208068847656, - "learning_rate": 3.0372783755114407e-05, - "loss": 2.0277, + "grad_norm": 0.4623928666114807, + "learning_rate": 3.037202606455523e-05, + "loss": 2.0275, "step": 388000 }, { "epoch": 18.180118708230125, - "grad_norm": 0.42027777433395386, - "learning_rate": 2.9615093195938773e-05, - "loss": 2.027, + "grad_norm": 0.4558030366897583, + "learning_rate": 2.96143355053796e-05, + "loss": 2.0265, "step": 389000 }, { "epoch": 18.22685423190167, - "grad_norm": 0.47702670097351074, - "learning_rate": 2.8857402636763145e-05, - "loss": 2.0251, + "grad_norm": 0.4751470685005188, + "learning_rate": 2.8856644946203966e-05, + "loss": 2.0245, "step": 390000 }, { "epoch": 18.273589755573212, - "grad_norm": 0.45635542273521423, - "learning_rate": 2.8100469768146687e-05, - "loss": 2.0282, + "grad_norm": 0.4706648886203766, + "learning_rate": 2.809971207758751e-05, + "loss": 2.0275, "step": 391000 }, { "epoch": 18.320325279244756, - "grad_norm": 0.42526108026504517, - "learning_rate": 2.7343536899530232e-05, - "loss": 2.0332, + "grad_norm": 0.4196698069572449, + "learning_rate": 2.734202151841188e-05, + "loss": 2.0325, "step": 392000 }, { "epoch": 18.367060802916296, - "grad_norm": 0.4558388292789459, - "learning_rate": 2.6585846340354594e-05, - "loss": 2.0351, + "grad_norm": 0.47690829634666443, + "learning_rate": 2.6584330959236245e-05, + "loss": 2.0342, "step": 393000 }, { "epoch": 18.41379632658784, - "grad_norm": 0.42901089787483215, - "learning_rate": 2.5828155781178966e-05, - "loss": 2.0279, + "grad_norm": 0.4510248899459839, + "learning_rate": 2.5826640400060614e-05, + "loss": 2.0278, "step": 394000 }, { "epoch": 18.460531850259382, - "grad_norm": 0.4340655207633972, - "learning_rate": 2.5071222912562508e-05, - "loss": 2.0378, + "grad_norm": 0.44831424951553345, + "learning_rate": 2.506894984088498e-05, + "loss": 2.0372, "step": 395000 }, { "epoch": 18.507267373930926, - "grad_norm": 0.4176914095878601, - "learning_rate": 2.4313532353386873e-05, - "loss": 2.0284, + "grad_norm": 0.4107140600681305, + "learning_rate": 2.43127746628277e-05, + "loss": 2.0282, "step": 396000 }, { "epoch": 18.55400289760247, - "grad_norm": 0.44839930534362793, - "learning_rate": 2.3556599484770418e-05, + "grad_norm": 0.42058345675468445, + "learning_rate": 2.3555084103652066e-05, "loss": 2.029, "step": 397000 }, { "epoch": 18.60073842127401, - "grad_norm": 0.44085559248924255, - "learning_rate": 2.2798908925594787e-05, - "loss": 2.035, + "grad_norm": 0.4265553653240204, + "learning_rate": 2.2797393544476435e-05, + "loss": 2.0345, "step": 398000 }, { "epoch": 18.647473944945553, - "grad_norm": 0.4037966728210449, - "learning_rate": 2.2041218366419152e-05, - "loss": 2.0359, + "grad_norm": 0.40311095118522644, + "learning_rate": 2.2040460675859976e-05, + "loss": 2.0349, "step": 399000 }, { "epoch": 18.694209468617096, - "grad_norm": 0.4513247311115265, - "learning_rate": 2.128352780724352e-05, - "loss": 2.0357, + "grad_norm": 0.475462406873703, + "learning_rate": 2.1282770116684342e-05, + "loss": 2.0345, "step": 400000 }, { "epoch": 18.74094499228864, - "grad_norm": 0.4081607162952423, + "grad_norm": 0.39591309428215027, "learning_rate": 2.0525837248067887e-05, - "loss": 2.0394, + "loss": 2.039, "step": 401000 }, { "epoch": 18.787680515960183, - "grad_norm": 0.42668506503105164, - "learning_rate": 1.976890437945143e-05, - "loss": 2.0359, + "grad_norm": 0.39184555411338806, + "learning_rate": 1.9768146688892256e-05, + "loss": 2.0356, "step": 402000 }, { "epoch": 18.834416039631723, - "grad_norm": 0.476225346326828, - "learning_rate": 1.9011971510834973e-05, - "loss": 2.0299, + "grad_norm": 0.48192140460014343, + "learning_rate": 1.901045612971662e-05, + "loss": 2.0292, "step": 403000 }, { "epoch": 18.881151563303266, - "grad_norm": 0.4597811698913574, - "learning_rate": 1.825428095165934e-05, - "loss": 2.0309, + "grad_norm": 0.4821534752845764, + "learning_rate": 1.825276557054099e-05, + "loss": 2.0301, "step": 404000 }, { "epoch": 18.92788708697481, - "grad_norm": 0.4811764359474182, - "learning_rate": 1.7497348083042884e-05, - "loss": 2.038, + "grad_norm": 0.49665161967277527, + "learning_rate": 1.7496590392483708e-05, + "loss": 2.0373, "step": 405000 }, { "epoch": 18.974622610646353, - "grad_norm": 0.4466439485549927, - "learning_rate": 1.673965752386725e-05, - "loss": 2.0422, + "grad_norm": 0.47178831696510315, + "learning_rate": 1.6738899833308073e-05, + "loss": 2.0413, "step": 406000 }, { "epoch": 19.0, - "eval_accuracy": 0.5193626231153677, - "eval_loss": 2.4923205375671387, - "eval_runtime": 184.7287, - "eval_samples_per_second": 380.694, - "eval_steps_per_second": 5.949, + "eval_accuracy": 0.5193102611823257, + "eval_loss": 2.4917633533477783, + "eval_runtime": 140.0603, + "eval_samples_per_second": 502.105, + "eval_steps_per_second": 7.847, "step": 406543 }, { "epoch": 19.021358134317897, - "grad_norm": 0.44981294870376587, - "learning_rate": 1.5981966964691618e-05, - "loss": 2.0278, + "grad_norm": 0.4976256489753723, + "learning_rate": 1.5981209274132442e-05, + "loss": 2.0271, "step": 407000 }, { "epoch": 19.068093657989436, - "grad_norm": 0.41826167702674866, - "learning_rate": 1.5225034096075161e-05, - "loss": 2.0176, + "grad_norm": 0.42240777611732483, + "learning_rate": 1.522351871495681e-05, + "loss": 2.0166, "step": 408000 }, { "epoch": 19.11482918166098, - "grad_norm": 0.5001415014266968, - "learning_rate": 1.4467343536899529e-05, - "loss": 2.0121, + "grad_norm": 0.502845287322998, + "learning_rate": 1.4465828155781176e-05, + "loss": 2.0114, "step": 409000 }, { "epoch": 19.161564705332523, - "grad_norm": 0.47413039207458496, - "learning_rate": 1.3709652977723896e-05, - "loss": 2.0172, + "grad_norm": 0.48909902572631836, + "learning_rate": 1.3708137596605545e-05, + "loss": 2.0167, "step": 410000 }, { "epoch": 19.208300229004067, - "grad_norm": 0.4496116638183594, - "learning_rate": 1.2951962418548263e-05, - "loss": 2.0206, + "grad_norm": 0.43516916036605835, + "learning_rate": 1.2951204727989087e-05, + "loss": 2.0198, "step": 411000 }, { "epoch": 19.25503575267561, - "grad_norm": 0.413718581199646, - "learning_rate": 1.2195029549931806e-05, - "loss": 2.0237, + "grad_norm": 0.4162134528160095, + "learning_rate": 1.219427185937263e-05, + "loss": 2.023, "step": 412000 }, { "epoch": 19.30177127634715, - "grad_norm": 0.4080525040626526, - "learning_rate": 1.1437338990756173e-05, - "loss": 2.0204, + "grad_norm": 0.4125491976737976, + "learning_rate": 1.1436581300196997e-05, + "loss": 2.0195, "step": 413000 }, { "epoch": 19.348506800018693, - "grad_norm": 0.4642179608345032, - "learning_rate": 1.0680406122139717e-05, - "loss": 2.0251, + "grad_norm": 0.45884060859680176, + "learning_rate": 1.067964843158054e-05, + "loss": 2.0244, "step": 414000 }, { "epoch": 19.395242323690237, - "grad_norm": 0.43880292773246765, - "learning_rate": 9.922715562964084e-06, - "loss": 2.0234, + "grad_norm": 0.44142866134643555, + "learning_rate": 9.92195787240491e-06, + "loss": 2.0226, "step": 415000 }, { "epoch": 19.44197784736178, - "grad_norm": 0.43078604340553284, - "learning_rate": 9.165025003788451e-06, + "grad_norm": 0.4506835639476776, + "learning_rate": 9.164267313229277e-06, "loss": 2.0157, "step": 416000 }, { "epoch": 19.488713371033324, - "grad_norm": 0.4106679856777191, - "learning_rate": 8.408092135171994e-06, - "loss": 2.0227, + "grad_norm": 0.4037749171257019, + "learning_rate": 8.406576754053644e-06, + "loss": 2.0217, "step": 417000 }, { "epoch": 19.535448894704864, - "grad_norm": 0.40442559123039246, - "learning_rate": 7.650401575996361e-06, - "loss": 2.0226, + "grad_norm": 0.4104039669036865, + "learning_rate": 7.649643885437187e-06, + "loss": 2.0217, "step": 418000 }, { "epoch": 19.582184418376407, - "grad_norm": 0.43674561381340027, - "learning_rate": 6.8934687073799055e-06, - "loss": 2.0116, + "grad_norm": 0.4536112844944, + "learning_rate": 6.891953326261554e-06, + "loss": 2.0111, "step": 419000 }, { "epoch": 19.62891994204795, - "grad_norm": 0.40172746777534485, - "learning_rate": 6.135778148204273e-06, - "loss": 2.0143, + "grad_norm": 0.4335053861141205, + "learning_rate": 6.134262767085921e-06, + "loss": 2.0136, "step": 420000 }, { "epoch": 19.675655465719494, - "grad_norm": 0.45278018712997437, - "learning_rate": 5.37808758902864e-06, - "loss": 2.0212, + "grad_norm": 0.49236640334129333, + "learning_rate": 5.377329898469465e-06, + "loss": 2.021, "step": 421000 }, { "epoch": 19.722390989391037, - "grad_norm": 0.45401936769485474, - "learning_rate": 4.621154720412183e-06, - "loss": 2.0186, + "grad_norm": 0.4698732793331146, + "learning_rate": 4.619639339293832e-06, + "loss": 2.018, "step": 422000 }, { "epoch": 19.769126513062577, - "grad_norm": 0.45410868525505066, - "learning_rate": 3.86346416123655e-06, - "loss": 2.0216, + "grad_norm": 0.4247852861881256, + "learning_rate": 3.861948780118199e-06, + "loss": 2.0213, "step": 423000 }, { "epoch": 19.81586203673412, - "grad_norm": 0.44420313835144043, - "learning_rate": 3.1065312926200936e-06, - "loss": 2.0232, + "grad_norm": 0.4361983835697174, + "learning_rate": 3.104258220942567e-06, + "loss": 2.023, "step": 424000 }, { "epoch": 19.862597560405664, - "grad_norm": 0.43311986327171326, - "learning_rate": 2.349598424003637e-06, - "loss": 2.0206, + "grad_norm": 0.43566909432411194, + "learning_rate": 2.34732535232611e-06, + "loss": 2.0198, "step": 425000 }, { "epoch": 19.909333084077208, - "grad_norm": 0.40267816185951233, - "learning_rate": 1.591907864828004e-06, - "loss": 2.021, + "grad_norm": 0.4237551987171173, + "learning_rate": 1.5903924837096529e-06, + "loss": 2.0206, "step": 426000 }, { "epoch": 19.95606860774875, - "grad_norm": 0.4614889621734619, - "learning_rate": 8.342173056523715e-07, - "loss": 2.0188, + "grad_norm": 0.4469325840473175, + "learning_rate": 8.327019245340203e-07, + "loss": 2.0185, "step": 427000 }, { "epoch": 20.0, - "eval_accuracy": 0.5192785875103685, - "eval_loss": 2.4960787296295166, - "eval_runtime": 184.2593, - "eval_samples_per_second": 381.663, - "eval_steps_per_second": 5.964, + "eval_accuracy": 0.5192642005255711, + "eval_loss": 2.495042085647583, + "eval_runtime": 140.3879, + "eval_samples_per_second": 500.933, + "eval_steps_per_second": 7.828, "step": 427940 }, { "epoch": 20.0, "step": 427940, "total_flos": 1.7890534785024e+18, - "train_loss": 2.2675810582360225, - "train_runtime": 92416.4259, - "train_samples_per_second": 148.176, - "train_steps_per_second": 4.631 + "train_loss": 2.267101509945356, + "train_runtime": 54453.6513, + "train_samples_per_second": 251.478, + "train_steps_per_second": 7.859 } ], "logging_steps": 1000,