{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 427940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04673552367154274, "grad_norm": 0.8248543739318848, "learning_rate": 9.375e-06, "loss": 6.1849, "step": 1000 }, { "epoch": 0.09347104734308548, "grad_norm": 1.0290669202804565, "learning_rate": 1.875e-05, "loss": 4.4262, "step": 2000 }, { "epoch": 0.14020657101462822, "grad_norm": 0.9003722667694092, "learning_rate": 2.8125e-05, "loss": 4.0814, "step": 3000 }, { "epoch": 0.18694209468617096, "grad_norm": 1.0617480278015137, "learning_rate": 3.75e-05, "loss": 3.8945, "step": 4000 }, { "epoch": 0.2336776183577137, "grad_norm": 0.9939677715301514, "learning_rate": 4.6874999999999994e-05, "loss": 3.7156, "step": 5000 }, { "epoch": 0.28041314202925643, "grad_norm": 0.9459012746810913, "learning_rate": 5.625e-05, "loss": 3.5923, "step": 6000 }, { "epoch": 0.3271486657007992, "grad_norm": 1.051654577255249, "learning_rate": 6.5625e-05, "loss": 3.4775, "step": 7000 }, { "epoch": 0.3738841893723419, "grad_norm": 0.8787893652915955, "learning_rate": 7.5e-05, "loss": 3.3928, "step": 8000 }, { "epoch": 0.4206197130438847, "grad_norm": 0.8886988162994385, "learning_rate": 8.437499999999999e-05, "loss": 3.3159, "step": 9000 }, { "epoch": 0.4673552367154274, "grad_norm": 0.8495904803276062, "learning_rate": 9.374999999999999e-05, "loss": 3.2375, "step": 10000 }, { "epoch": 0.5140907603869701, "grad_norm": 0.8732250928878784, "learning_rate": 0.00010312499999999999, "loss": 3.1634, "step": 11000 }, { "epoch": 0.5608262840585129, "grad_norm": 0.8345146775245667, "learning_rate": 0.00011248124999999999, "loss": 3.0978, "step": 12000 }, { "epoch": 0.6075618077300556, "grad_norm": 0.8283113241195679, "learning_rate": 0.00012185624999999998, "loss": 3.0404, "step": 13000 }, { "epoch": 0.6542973314015984, "grad_norm": 0.8576833605766296, "learning_rate": 0.000131221875, "loss": 2.994, "step": 14000 }, { "epoch": 0.7010328550731411, "grad_norm": 0.8024506568908691, "learning_rate": 0.000140596875, "loss": 2.9562, "step": 15000 }, { "epoch": 0.7477683787446838, "grad_norm": 0.7570227384567261, "learning_rate": 0.0001499625, "loss": 2.9387, "step": 16000 }, { "epoch": 0.7945039024162266, "grad_norm": 0.7729004621505737, "learning_rate": 0.00015933749999999996, "loss": 2.9017, "step": 17000 }, { "epoch": 0.8412394260877694, "grad_norm": 0.7341739535331726, "learning_rate": 0.00016871249999999996, "loss": 2.8696, "step": 18000 }, { "epoch": 0.887974949759312, "grad_norm": 0.6923528909683228, "learning_rate": 0.000178078125, "loss": 2.8418, "step": 19000 }, { "epoch": 0.9347104734308548, "grad_norm": 0.6515923738479614, "learning_rate": 0.00018743437499999996, "loss": 2.8301, "step": 20000 }, { "epoch": 0.9814459971023975, "grad_norm": 0.6980977654457092, "learning_rate": 0.00019680937499999996, "loss": 2.8033, "step": 21000 }, { "epoch": 1.0, "eval_accuracy": 0.4671583335075943, "eval_loss": 2.906492233276367, "eval_runtime": 181.9195, "eval_samples_per_second": 386.572, "eval_steps_per_second": 6.041, "step": 21397 }, { "epoch": 1.0281815207739402, "grad_norm": 0.6191843748092651, "learning_rate": 0.00020618437499999995, "loss": 2.7769, "step": 22000 }, { "epoch": 1.074917044445483, "grad_norm": 0.6327579021453857, "learning_rate": 0.00021555937499999998, "loss": 2.7631, "step": 23000 }, { "epoch": 1.1216525681170257, "grad_norm": 0.5806933641433716, "learning_rate": 0.00022493437499999998, "loss": 2.7601, "step": 24000 }, { "epoch": 1.1683880917885685, "grad_norm": 0.6323985457420349, "learning_rate": 0.00023429999999999998, "loss": 2.7296, "step": 25000 }, { "epoch": 1.2151236154601113, "grad_norm": 0.5768846869468689, "learning_rate": 0.00024367499999999997, "loss": 2.7258, "step": 26000 }, { "epoch": 1.261859139131654, "grad_norm": 0.5394349694252014, "learning_rate": 0.00025305, "loss": 2.7186, "step": 27000 }, { "epoch": 1.3085946628031966, "grad_norm": 0.5966582298278809, "learning_rate": 0.000262415625, "loss": 2.7106, "step": 28000 }, { "epoch": 1.3553301864747396, "grad_norm": 0.5289971232414246, "learning_rate": 0.000271790625, "loss": 2.7075, "step": 29000 }, { "epoch": 1.4020657101462821, "grad_norm": 0.5028895139694214, "learning_rate": 0.00028115624999999994, "loss": 2.6847, "step": 30000 }, { "epoch": 1.4488012338178249, "grad_norm": 0.46625378727912903, "learning_rate": 0.00029053124999999994, "loss": 2.6701, "step": 31000 }, { "epoch": 1.4955367574893677, "grad_norm": 0.47429364919662476, "learning_rate": 0.00029990624999999993, "loss": 2.6642, "step": 32000 }, { "epoch": 1.5422722811609104, "grad_norm": 0.4630049765110016, "learning_rate": 0.0002992498863464161, "loss": 2.6493, "step": 33000 }, { "epoch": 1.5890078048324532, "grad_norm": 0.4727250933647156, "learning_rate": 0.0002984929534777996, "loss": 2.6392, "step": 34000 }, { "epoch": 1.6357433285039957, "grad_norm": 0.42802268266677856, "learning_rate": 0.000297735262918624, "loss": 2.6413, "step": 35000 }, { "epoch": 1.6824788521755387, "grad_norm": 0.46135851740837097, "learning_rate": 0.00029697833005000755, "loss": 2.6263, "step": 36000 }, { "epoch": 1.7292143758470813, "grad_norm": 0.41609084606170654, "learning_rate": 0.0002962206394908319, "loss": 2.6188, "step": 37000 }, { "epoch": 1.7759498995186243, "grad_norm": 0.4131929278373718, "learning_rate": 0.00029546294893165626, "loss": 2.6028, "step": 38000 }, { "epoch": 1.8226854231901668, "grad_norm": 0.42282310128211975, "learning_rate": 0.00029470601606303983, "loss": 2.5922, "step": 39000 }, { "epoch": 1.8694209468617096, "grad_norm": 0.37517085671424866, "learning_rate": 0.0002939483255038642, "loss": 2.5966, "step": 40000 }, { "epoch": 1.9161564705332523, "grad_norm": 0.3877894878387451, "learning_rate": 0.00029319139263524777, "loss": 2.5769, "step": 41000 }, { "epoch": 1.962891994204795, "grad_norm": 0.3990887999534607, "learning_rate": 0.0002924344597666313, "loss": 2.57, "step": 42000 }, { "epoch": 2.0, "eval_accuracy": 0.4900495319350634, "eval_loss": 2.6932899951934814, "eval_runtime": 184.3808, "eval_samples_per_second": 381.412, "eval_steps_per_second": 5.96, "step": 42794 }, { "epoch": 2.009627517876338, "grad_norm": 0.3544236421585083, "learning_rate": 0.00029167676920745564, "loss": 2.5606, "step": 43000 }, { "epoch": 2.0563630415478804, "grad_norm": 0.39064693450927734, "learning_rate": 0.00029091907864828, "loss": 2.5214, "step": 44000 }, { "epoch": 2.1030985652194234, "grad_norm": 0.39446505904197693, "learning_rate": 0.0002901613880891044, "loss": 2.5125, "step": 45000 }, { "epoch": 2.149834088890966, "grad_norm": 0.4003261923789978, "learning_rate": 0.00028940369752992876, "loss": 2.5217, "step": 46000 }, { "epoch": 2.196569612562509, "grad_norm": 0.38740643858909607, "learning_rate": 0.0002886467646613123, "loss": 2.5115, "step": 47000 }, { "epoch": 2.2433051362340515, "grad_norm": 0.36951524019241333, "learning_rate": 0.0002878898317926958, "loss": 2.5228, "step": 48000 }, { "epoch": 2.2900406599055945, "grad_norm": 0.3615242540836334, "learning_rate": 0.0002871321412335202, "loss": 2.5062, "step": 49000 }, { "epoch": 2.336776183577137, "grad_norm": 0.3720042407512665, "learning_rate": 0.00028637445067434457, "loss": 2.4979, "step": 50000 }, { "epoch": 2.3835117072486796, "grad_norm": 0.3349687457084656, "learning_rate": 0.0002856175178057281, "loss": 2.5054, "step": 51000 }, { "epoch": 2.4302472309202225, "grad_norm": 0.354491263628006, "learning_rate": 0.0002848598272465525, "loss": 2.4955, "step": 52000 }, { "epoch": 2.476982754591765, "grad_norm": 0.36612212657928467, "learning_rate": 0.00028410213668737686, "loss": 2.4862, "step": 53000 }, { "epoch": 2.523718278263308, "grad_norm": 0.364339679479599, "learning_rate": 0.0002833444461282012, "loss": 2.4889, "step": 54000 }, { "epoch": 2.5704538019348506, "grad_norm": 0.34728386998176575, "learning_rate": 0.0002825875132595848, "loss": 2.4912, "step": 55000 }, { "epoch": 2.617189325606393, "grad_norm": 0.3255521059036255, "learning_rate": 0.00028182982270040914, "loss": 2.48, "step": 56000 }, { "epoch": 2.663924849277936, "grad_norm": 0.36621731519699097, "learning_rate": 0.0002810721321412335, "loss": 2.4846, "step": 57000 }, { "epoch": 2.710660372949479, "grad_norm": 0.37520626187324524, "learning_rate": 0.00028031519927261707, "loss": 2.4712, "step": 58000 }, { "epoch": 2.7573958966210217, "grad_norm": 0.3485676348209381, "learning_rate": 0.0002795582664040006, "loss": 2.477, "step": 59000 }, { "epoch": 2.8041314202925642, "grad_norm": 0.33132848143577576, "learning_rate": 0.00027880057584482495, "loss": 2.479, "step": 60000 }, { "epoch": 2.8508669439641072, "grad_norm": 0.3298667371273041, "learning_rate": 0.0002780428852856493, "loss": 2.4676, "step": 61000 }, { "epoch": 2.8976024676356498, "grad_norm": 0.3598766624927521, "learning_rate": 0.0002772851947264737, "loss": 2.4698, "step": 62000 }, { "epoch": 2.9443379913071928, "grad_norm": 0.34787800908088684, "learning_rate": 0.00027652826185785724, "loss": 2.451, "step": 63000 }, { "epoch": 2.9910735149787353, "grad_norm": 0.3385255038738251, "learning_rate": 0.0002757705712986816, "loss": 2.4599, "step": 64000 }, { "epoch": 3.0, "eval_accuracy": 0.5007376675519123, "eval_loss": 2.6011228561401367, "eval_runtime": 183.7613, "eval_samples_per_second": 382.697, "eval_steps_per_second": 5.981, "step": 64191 }, { "epoch": 3.0378090386502783, "grad_norm": 0.33664873242378235, "learning_rate": 0.0002750136384300651, "loss": 2.4104, "step": 65000 }, { "epoch": 3.084544562321821, "grad_norm": 0.3183669149875641, "learning_rate": 0.00027425594787088947, "loss": 2.4087, "step": 66000 }, { "epoch": 3.1312800859933634, "grad_norm": 0.3450115919113159, "learning_rate": 0.00027349901500227304, "loss": 2.4053, "step": 67000 }, { "epoch": 3.1780156096649064, "grad_norm": 0.3532576560974121, "learning_rate": 0.0002727413244430974, "loss": 2.405, "step": 68000 }, { "epoch": 3.224751133336449, "grad_norm": 0.356446772813797, "learning_rate": 0.00027198363388392175, "loss": 2.4053, "step": 69000 }, { "epoch": 3.271486657007992, "grad_norm": 0.30622342228889465, "learning_rate": 0.00027122670101530533, "loss": 2.4025, "step": 70000 }, { "epoch": 3.3182221806795344, "grad_norm": 0.3361418545246124, "learning_rate": 0.0002704690104561297, "loss": 2.4051, "step": 71000 }, { "epoch": 3.3649577043510774, "grad_norm": 0.31644776463508606, "learning_rate": 0.00026971131989695404, "loss": 2.4087, "step": 72000 }, { "epoch": 3.41169322802262, "grad_norm": 0.35094141960144043, "learning_rate": 0.0002689543870283376, "loss": 2.3977, "step": 73000 }, { "epoch": 3.4584287516941625, "grad_norm": 0.36921316385269165, "learning_rate": 0.00026819669646916197, "loss": 2.4109, "step": 74000 }, { "epoch": 3.5051642753657055, "grad_norm": 0.32217535376548767, "learning_rate": 0.00026743976360054555, "loss": 2.4036, "step": 75000 }, { "epoch": 3.551899799037248, "grad_norm": 0.34006989002227783, "learning_rate": 0.00026668283073192907, "loss": 2.4057, "step": 76000 }, { "epoch": 3.598635322708791, "grad_norm": 0.3229130506515503, "learning_rate": 0.0002659251401727534, "loss": 2.4056, "step": 77000 }, { "epoch": 3.6453708463803336, "grad_norm": 0.3431253433227539, "learning_rate": 0.0002651674496135778, "loss": 2.397, "step": 78000 }, { "epoch": 3.6921063700518766, "grad_norm": 0.3360424041748047, "learning_rate": 0.00026440975905440213, "loss": 2.3908, "step": 79000 }, { "epoch": 3.738841893723419, "grad_norm": 0.30646803975105286, "learning_rate": 0.00026365206849522654, "loss": 2.3939, "step": 80000 }, { "epoch": 3.785577417394962, "grad_norm": 0.35814180970191956, "learning_rate": 0.0002628943779360509, "loss": 2.3977, "step": 81000 }, { "epoch": 3.8323129410665047, "grad_norm": 0.3155287206172943, "learning_rate": 0.0002621382027579936, "loss": 2.3997, "step": 82000 }, { "epoch": 3.879048464738047, "grad_norm": 0.33178508281707764, "learning_rate": 0.000261380512198818, "loss": 2.3908, "step": 83000 }, { "epoch": 3.92578398840959, "grad_norm": 0.32515403628349304, "learning_rate": 0.00026062282163964235, "loss": 2.3905, "step": 84000 }, { "epoch": 3.9725195120811327, "grad_norm": 0.3208604156970978, "learning_rate": 0.00025986588877102587, "loss": 2.3967, "step": 85000 }, { "epoch": 4.0, "eval_accuracy": 0.5063414538940354, "eval_loss": 2.5534298419952393, "eval_runtime": 183.4478, "eval_samples_per_second": 383.352, "eval_steps_per_second": 5.991, "step": 85588 }, { "epoch": 4.019255035752676, "grad_norm": 0.3112667202949524, "learning_rate": 0.0002591081982118503, "loss": 2.3708, "step": 86000 }, { "epoch": 4.065990559424218, "grad_norm": 0.33806201815605164, "learning_rate": 0.00025835050765267464, "loss": 2.3319, "step": 87000 }, { "epoch": 4.112726083095761, "grad_norm": 0.33769309520721436, "learning_rate": 0.00025759357478405816, "loss": 2.3438, "step": 88000 }, { "epoch": 4.159461606767304, "grad_norm": 0.3399094343185425, "learning_rate": 0.00025683664191544173, "loss": 2.3478, "step": 89000 }, { "epoch": 4.206197130438847, "grad_norm": 0.3605833053588867, "learning_rate": 0.0002560789513562661, "loss": 2.3403, "step": 90000 }, { "epoch": 4.252932654110389, "grad_norm": 0.35199499130249023, "learning_rate": 0.00025532126079709044, "loss": 2.3451, "step": 91000 }, { "epoch": 4.299668177781932, "grad_norm": 0.3519517183303833, "learning_rate": 0.00025456357023791485, "loss": 2.3433, "step": 92000 }, { "epoch": 4.346403701453474, "grad_norm": 0.3213213086128235, "learning_rate": 0.0002538058796787392, "loss": 2.3399, "step": 93000 }, { "epoch": 4.393139225125018, "grad_norm": 0.3580070436000824, "learning_rate": 0.00025304894681012273, "loss": 2.3463, "step": 94000 }, { "epoch": 4.43987474879656, "grad_norm": 0.338870108127594, "learning_rate": 0.0002522912562509471, "loss": 2.3476, "step": 95000 }, { "epoch": 4.486610272468103, "grad_norm": 0.33986231684684753, "learning_rate": 0.0002515343233823306, "loss": 2.3448, "step": 96000 }, { "epoch": 4.5333457961396455, "grad_norm": 0.33853477239608765, "learning_rate": 0.000250776632823155, "loss": 2.3463, "step": 97000 }, { "epoch": 4.580081319811189, "grad_norm": 0.3056027293205261, "learning_rate": 0.00025001894226397937, "loss": 2.3439, "step": 98000 }, { "epoch": 4.6268168434827315, "grad_norm": 0.330951064825058, "learning_rate": 0.0002492620093953629, "loss": 2.3468, "step": 99000 }, { "epoch": 4.673552367154274, "grad_norm": 0.32781165838241577, "learning_rate": 0.00024850431883618725, "loss": 2.3426, "step": 100000 }, { "epoch": 4.720287890825817, "grad_norm": 0.326337993144989, "learning_rate": 0.0002477473859675708, "loss": 2.3385, "step": 101000 }, { "epoch": 4.767023414497359, "grad_norm": 0.30979540944099426, "learning_rate": 0.0002469896954083952, "loss": 2.3517, "step": 102000 }, { "epoch": 4.8137589381689025, "grad_norm": 0.3351231813430786, "learning_rate": 0.00024623200484921953, "loss": 2.3306, "step": 103000 }, { "epoch": 4.860494461840445, "grad_norm": 0.3361092209815979, "learning_rate": 0.0002454750719806031, "loss": 2.3406, "step": 104000 }, { "epoch": 4.907229985511988, "grad_norm": 0.36400631070137024, "learning_rate": 0.00024471738142142746, "loss": 2.345, "step": 105000 }, { "epoch": 4.95396550918353, "grad_norm": 0.3127535581588745, "learning_rate": 0.000243960448552811, "loss": 2.3376, "step": 106000 }, { "epoch": 5.0, "eval_accuracy": 0.5100188341244781, "eval_loss": 2.524883985519409, "eval_runtime": 183.6153, "eval_samples_per_second": 383.002, "eval_steps_per_second": 5.985, "step": 106985 }, { "epoch": 5.000701032855073, "grad_norm": 0.3345338702201843, "learning_rate": 0.00024320275799363537, "loss": 2.3364, "step": 107000 }, { "epoch": 5.047436556526616, "grad_norm": 0.32806724309921265, "learning_rate": 0.00024244506743445975, "loss": 2.2779, "step": 108000 }, { "epoch": 5.094172080198159, "grad_norm": 0.3400112986564636, "learning_rate": 0.0002416881345658433, "loss": 2.287, "step": 109000 }, { "epoch": 5.140907603869701, "grad_norm": 0.2923315763473511, "learning_rate": 0.00024093120169722682, "loss": 2.2887, "step": 110000 }, { "epoch": 5.187643127541244, "grad_norm": 0.36175695061683655, "learning_rate": 0.0002401735111380512, "loss": 2.2956, "step": 111000 }, { "epoch": 5.234378651212787, "grad_norm": 0.32664230465888977, "learning_rate": 0.00023941582057887558, "loss": 2.2987, "step": 112000 }, { "epoch": 5.28111417488433, "grad_norm": 0.3379104733467102, "learning_rate": 0.00023865813001969994, "loss": 2.2949, "step": 113000 }, { "epoch": 5.327849698555872, "grad_norm": 0.3442465364933014, "learning_rate": 0.0002379011971510835, "loss": 2.2926, "step": 114000 }, { "epoch": 5.374585222227415, "grad_norm": 0.3272307217121124, "learning_rate": 0.00023714350659190784, "loss": 2.291, "step": 115000 }, { "epoch": 5.421320745898957, "grad_norm": 0.33245283365249634, "learning_rate": 0.0002363858160327322, "loss": 2.304, "step": 116000 }, { "epoch": 5.468056269570501, "grad_norm": 0.3197270333766937, "learning_rate": 0.00023562812547355658, "loss": 2.3024, "step": 117000 }, { "epoch": 5.514791793242043, "grad_norm": 0.32617905735969543, "learning_rate": 0.00023487119260494013, "loss": 2.3062, "step": 118000 }, { "epoch": 5.561527316913586, "grad_norm": 0.3404013514518738, "learning_rate": 0.00023411425973632365, "loss": 2.3055, "step": 119000 }, { "epoch": 5.6082628405851285, "grad_norm": 0.35346364974975586, "learning_rate": 0.00023335656917714803, "loss": 2.3029, "step": 120000 }, { "epoch": 5.654998364256672, "grad_norm": 0.33657070994377136, "learning_rate": 0.00023259963630853155, "loss": 2.3001, "step": 121000 }, { "epoch": 5.7017338879282145, "grad_norm": 0.3316746950149536, "learning_rate": 0.00023184194574935594, "loss": 2.2941, "step": 122000 }, { "epoch": 5.748469411599757, "grad_norm": 0.2895432114601135, "learning_rate": 0.00023108425519018032, "loss": 2.3032, "step": 123000 }, { "epoch": 5.7952049352712995, "grad_norm": 0.3094755709171295, "learning_rate": 0.00023032656463100467, "loss": 2.3081, "step": 124000 }, { "epoch": 5.841940458942842, "grad_norm": 0.3222903311252594, "learning_rate": 0.00022956887407182906, "loss": 2.3091, "step": 125000 }, { "epoch": 5.8886759826143855, "grad_norm": 0.34477153420448303, "learning_rate": 0.0002288111835126534, "loss": 2.2979, "step": 126000 }, { "epoch": 5.935411506285928, "grad_norm": 0.31651487946510315, "learning_rate": 0.00022805500833459613, "loss": 2.304, "step": 127000 }, { "epoch": 5.982147029957471, "grad_norm": 0.32441022992134094, "learning_rate": 0.00022729731777542048, "loss": 2.3036, "step": 128000 }, { "epoch": 6.0, "eval_accuracy": 0.5122081094080007, "eval_loss": 2.509770154953003, "eval_runtime": 183.5984, "eval_samples_per_second": 383.037, "eval_steps_per_second": 5.986, "step": 128382 }, { "epoch": 6.028882553629013, "grad_norm": 0.339528352022171, "learning_rate": 0.0002265396272162449, "loss": 2.2707, "step": 129000 }, { "epoch": 6.075618077300557, "grad_norm": 0.33392465114593506, "learning_rate": 0.00022578193665706925, "loss": 2.2395, "step": 130000 }, { "epoch": 6.122353600972099, "grad_norm": 0.3576413094997406, "learning_rate": 0.00022502500378845277, "loss": 2.2522, "step": 131000 }, { "epoch": 6.169089124643642, "grad_norm": 0.370988667011261, "learning_rate": 0.00022426807091983632, "loss": 2.2593, "step": 132000 }, { "epoch": 6.215824648315184, "grad_norm": 0.3223007619380951, "learning_rate": 0.00022351038036066067, "loss": 2.253, "step": 133000 }, { "epoch": 6.262560171986727, "grad_norm": 0.3452748954296112, "learning_rate": 0.00022275268980148503, "loss": 2.2628, "step": 134000 }, { "epoch": 6.30929569565827, "grad_norm": 0.33391740918159485, "learning_rate": 0.0002219957569328686, "loss": 2.26, "step": 135000 }, { "epoch": 6.356031219329813, "grad_norm": 0.32963138818740845, "learning_rate": 0.00022123806637369296, "loss": 2.2612, "step": 136000 }, { "epoch": 6.402766743001355, "grad_norm": 0.33252349495887756, "learning_rate": 0.0002204803758145173, "loss": 2.2659, "step": 137000 }, { "epoch": 6.449502266672898, "grad_norm": 0.3672574460506439, "learning_rate": 0.00021972344294590086, "loss": 2.2692, "step": 138000 }, { "epoch": 6.496237790344441, "grad_norm": 0.36064982414245605, "learning_rate": 0.00021896575238672524, "loss": 2.261, "step": 139000 }, { "epoch": 6.542973314015984, "grad_norm": 0.3338433802127838, "learning_rate": 0.0002182080618275496, "loss": 2.2741, "step": 140000 }, { "epoch": 6.589708837687526, "grad_norm": 0.36847883462905884, "learning_rate": 0.00021745112895893315, "loss": 2.2643, "step": 141000 }, { "epoch": 6.636444361359069, "grad_norm": 0.33524858951568604, "learning_rate": 0.0002166941960903167, "loss": 2.2804, "step": 142000 }, { "epoch": 6.6831798850306114, "grad_norm": 0.3156317174434662, "learning_rate": 0.00021593650553114108, "loss": 2.2583, "step": 143000 }, { "epoch": 6.729915408702155, "grad_norm": 0.3375246226787567, "learning_rate": 0.00021517881497196543, "loss": 2.2719, "step": 144000 }, { "epoch": 6.776650932373697, "grad_norm": 0.29037168622016907, "learning_rate": 0.0002144211244127898, "loss": 2.2612, "step": 145000 }, { "epoch": 6.82338645604524, "grad_norm": 0.37422460317611694, "learning_rate": 0.00021366419154417334, "loss": 2.2678, "step": 146000 }, { "epoch": 6.8701219797167825, "grad_norm": 0.3063635230064392, "learning_rate": 0.00021290650098499772, "loss": 2.2716, "step": 147000 }, { "epoch": 6.916857503388325, "grad_norm": 0.3298608660697937, "learning_rate": 0.00021214881042582208, "loss": 2.274, "step": 148000 }, { "epoch": 6.9635930270598685, "grad_norm": 0.342487096786499, "learning_rate": 0.00021139111986664643, "loss": 2.2667, "step": 149000 }, { "epoch": 7.0, "eval_accuracy": 0.5142437004663223, "eval_loss": 2.497164011001587, "eval_runtime": 183.5925, "eval_samples_per_second": 383.049, "eval_steps_per_second": 5.986, "step": 149779 }, { "epoch": 7.010328550731411, "grad_norm": 0.3236839175224304, "learning_rate": 0.00021063418699802998, "loss": 2.2595, "step": 150000 }, { "epoch": 7.057064074402954, "grad_norm": 0.3207620680332184, "learning_rate": 0.00020987649643885433, "loss": 2.2096, "step": 151000 }, { "epoch": 7.103799598074496, "grad_norm": 0.3556087911128998, "learning_rate": 0.0002091195635702379, "loss": 2.2121, "step": 152000 }, { "epoch": 7.15053512174604, "grad_norm": 0.3455691933631897, "learning_rate": 0.00020836187301106226, "loss": 2.225, "step": 153000 }, { "epoch": 7.197270645417582, "grad_norm": 0.3611804246902466, "learning_rate": 0.00020760418245188662, "loss": 2.2173, "step": 154000 }, { "epoch": 7.244006169089125, "grad_norm": 0.3559030592441559, "learning_rate": 0.00020684724958327017, "loss": 2.224, "step": 155000 }, { "epoch": 7.290741692760667, "grad_norm": 0.3480346202850342, "learning_rate": 0.0002060903167146537, "loss": 2.235, "step": 156000 }, { "epoch": 7.33747721643221, "grad_norm": 0.3583781123161316, "learning_rate": 0.0002053326261554781, "loss": 2.2285, "step": 157000 }, { "epoch": 7.384212740103753, "grad_norm": 0.35747647285461426, "learning_rate": 0.00020457493559630245, "loss": 2.2334, "step": 158000 }, { "epoch": 7.430948263775296, "grad_norm": 0.33115991950035095, "learning_rate": 0.0002038172450371268, "loss": 2.2417, "step": 159000 }, { "epoch": 7.477683787446838, "grad_norm": 0.362159788608551, "learning_rate": 0.0002030595544779512, "loss": 2.2351, "step": 160000 }, { "epoch": 7.524419311118381, "grad_norm": 0.3421652913093567, "learning_rate": 0.00020230262160933474, "loss": 2.2406, "step": 161000 }, { "epoch": 7.571154834789924, "grad_norm": 0.3309684991836548, "learning_rate": 0.0002015449310501591, "loss": 2.2393, "step": 162000 }, { "epoch": 7.617890358461467, "grad_norm": 0.36814552545547485, "learning_rate": 0.00020078724049098345, "loss": 2.2281, "step": 163000 }, { "epoch": 7.664625882133009, "grad_norm": 0.337954044342041, "learning_rate": 0.00020003030762236703, "loss": 2.2351, "step": 164000 }, { "epoch": 7.711361405804552, "grad_norm": 0.32359594106674194, "learning_rate": 0.00019927261706319138, "loss": 2.2397, "step": 165000 }, { "epoch": 7.758096929476094, "grad_norm": 0.36153310537338257, "learning_rate": 0.00019851568419457493, "loss": 2.2487, "step": 166000 }, { "epoch": 7.804832453147638, "grad_norm": 0.32292258739471436, "learning_rate": 0.00019775799363539929, "loss": 2.2384, "step": 167000 }, { "epoch": 7.85156797681918, "grad_norm": 0.3319338262081146, "learning_rate": 0.00019700030307622364, "loss": 2.2451, "step": 168000 }, { "epoch": 7.898303500490723, "grad_norm": 0.33854612708091736, "learning_rate": 0.00019624261251704802, "loss": 2.2433, "step": 169000 }, { "epoch": 7.9450390241622655, "grad_norm": 0.3438510298728943, "learning_rate": 0.00019548567964843157, "loss": 2.2488, "step": 170000 }, { "epoch": 7.991774547833808, "grad_norm": 0.3351077437400818, "learning_rate": 0.0001947287467798151, "loss": 2.2441, "step": 171000 }, { "epoch": 8.0, "eval_accuracy": 0.5155260938360413, "eval_loss": 2.488969326019287, "eval_runtime": 184.3466, "eval_samples_per_second": 381.482, "eval_steps_per_second": 5.962, "step": 171176 }, { "epoch": 8.038510071505351, "grad_norm": 0.34663721919059753, "learning_rate": 0.00019397105622063948, "loss": 2.1914, "step": 172000 }, { "epoch": 8.085245595176893, "grad_norm": 0.3605271875858307, "learning_rate": 0.00019321336566146386, "loss": 2.1837, "step": 173000 }, { "epoch": 8.131981118848437, "grad_norm": 0.3562985956668854, "learning_rate": 0.00019245643279284738, "loss": 2.1945, "step": 174000 }, { "epoch": 8.17871664251998, "grad_norm": 0.39077556133270264, "learning_rate": 0.00019169874223367176, "loss": 2.1957, "step": 175000 }, { "epoch": 8.225452166191522, "grad_norm": 0.3390072286128998, "learning_rate": 0.00019094180936505528, "loss": 2.1947, "step": 176000 }, { "epoch": 8.272187689863065, "grad_norm": 0.3693525195121765, "learning_rate": 0.00019018411880587964, "loss": 2.2082, "step": 177000 }, { "epoch": 8.318923213534609, "grad_norm": 0.3887868821620941, "learning_rate": 0.0001894271859372632, "loss": 2.2082, "step": 178000 }, { "epoch": 8.36565873720615, "grad_norm": 0.3565722107887268, "learning_rate": 0.00018866949537808757, "loss": 2.2017, "step": 179000 }, { "epoch": 8.412394260877694, "grad_norm": 0.3344291150569916, "learning_rate": 0.00018791256250947112, "loss": 2.2127, "step": 180000 }, { "epoch": 8.459129784549235, "grad_norm": 0.3664642572402954, "learning_rate": 0.00018715487195029547, "loss": 2.2137, "step": 181000 }, { "epoch": 8.505865308220779, "grad_norm": 0.36379629373550415, "learning_rate": 0.00018639718139111985, "loss": 2.2126, "step": 182000 }, { "epoch": 8.552600831892322, "grad_norm": 0.34674862027168274, "learning_rate": 0.0001856402485225034, "loss": 2.2106, "step": 183000 }, { "epoch": 8.599336355563864, "grad_norm": 0.3491998314857483, "learning_rate": 0.00018488255796332776, "loss": 2.2122, "step": 184000 }, { "epoch": 8.646071879235407, "grad_norm": 0.3424486219882965, "learning_rate": 0.00018412486740415211, "loss": 2.2144, "step": 185000 }, { "epoch": 8.692807402906949, "grad_norm": 0.37751293182373047, "learning_rate": 0.00018336717684497647, "loss": 2.2133, "step": 186000 }, { "epoch": 8.739542926578492, "grad_norm": 0.3451727628707886, "learning_rate": 0.00018261024397636004, "loss": 2.2104, "step": 187000 }, { "epoch": 8.786278450250036, "grad_norm": 0.34991177916526794, "learning_rate": 0.0001818533111077436, "loss": 2.2203, "step": 188000 }, { "epoch": 8.833013973921577, "grad_norm": 0.3395465314388275, "learning_rate": 0.00018109562054856795, "loss": 2.2196, "step": 189000 }, { "epoch": 8.87974949759312, "grad_norm": 0.3524405360221863, "learning_rate": 0.0001803379299893923, "loss": 2.2167, "step": 190000 }, { "epoch": 8.926485021264662, "grad_norm": 0.37763360142707825, "learning_rate": 0.00017958023943021669, "loss": 2.218, "step": 191000 }, { "epoch": 8.973220544936206, "grad_norm": 0.33627304434776306, "learning_rate": 0.00017882254887104104, "loss": 2.2176, "step": 192000 }, { "epoch": 9.0, "eval_accuracy": 0.5163104633250385, "eval_loss": 2.4842023849487305, "eval_runtime": 183.7529, "eval_samples_per_second": 382.715, "eval_steps_per_second": 5.981, "step": 192573 }, { "epoch": 9.01995606860775, "grad_norm": 0.33450332283973694, "learning_rate": 0.0001780656160024246, "loss": 2.1874, "step": 193000 }, { "epoch": 9.066691592279291, "grad_norm": 0.3548380434513092, "learning_rate": 0.00017730792544324895, "loss": 2.1647, "step": 194000 }, { "epoch": 9.113427115950834, "grad_norm": 0.37969735264778137, "learning_rate": 0.00017655023488407333, "loss": 2.1695, "step": 195000 }, { "epoch": 9.160162639622378, "grad_norm": 0.3634478747844696, "learning_rate": 0.00017579330201545688, "loss": 2.1706, "step": 196000 }, { "epoch": 9.20689816329392, "grad_norm": 0.3782573640346527, "learning_rate": 0.00017503561145628123, "loss": 2.1699, "step": 197000 }, { "epoch": 9.253633686965463, "grad_norm": 0.3733539283275604, "learning_rate": 0.00017427867858766478, "loss": 2.1783, "step": 198000 }, { "epoch": 9.300369210637005, "grad_norm": 0.367725133895874, "learning_rate": 0.00017352098802848916, "loss": 2.1825, "step": 199000 }, { "epoch": 9.347104734308548, "grad_norm": 0.3784054219722748, "learning_rate": 0.0001727640551598727, "loss": 2.1842, "step": 200000 }, { "epoch": 9.393840257980091, "grad_norm": 0.37121427059173584, "learning_rate": 0.00017200636460069707, "loss": 2.1854, "step": 201000 }, { "epoch": 9.440575781651633, "grad_norm": 0.3519172668457031, "learning_rate": 0.00017124867404152142, "loss": 2.1841, "step": 202000 }, { "epoch": 9.487311305323177, "grad_norm": 0.37216153740882874, "learning_rate": 0.00017049098348234578, "loss": 2.185, "step": 203000 }, { "epoch": 9.534046828994718, "grad_norm": 0.3630688488483429, "learning_rate": 0.00016973405061372935, "loss": 2.1849, "step": 204000 }, { "epoch": 9.580782352666262, "grad_norm": 0.33998388051986694, "learning_rate": 0.0001689763600545537, "loss": 2.1911, "step": 205000 }, { "epoch": 9.627517876337805, "grad_norm": 0.3803759515285492, "learning_rate": 0.00016821942718593726, "loss": 2.1905, "step": 206000 }, { "epoch": 9.674253400009347, "grad_norm": 0.3438846170902252, "learning_rate": 0.0001674617366267616, "loss": 2.1997, "step": 207000 }, { "epoch": 9.72098892368089, "grad_norm": 0.3872727155685425, "learning_rate": 0.000166704046067586, "loss": 2.1918, "step": 208000 }, { "epoch": 9.767724447352432, "grad_norm": 0.32597804069519043, "learning_rate": 0.00016594635550841035, "loss": 2.1798, "step": 209000 }, { "epoch": 9.814459971023975, "grad_norm": 0.3644990026950836, "learning_rate": 0.00016519018033035306, "loss": 2.1933, "step": 210000 }, { "epoch": 9.861195494695519, "grad_norm": 0.3531164228916168, "learning_rate": 0.00016443248977117742, "loss": 2.1937, "step": 211000 }, { "epoch": 9.90793101836706, "grad_norm": 0.3724701702594757, "learning_rate": 0.00016367479921200183, "loss": 2.2031, "step": 212000 }, { "epoch": 9.954666542038604, "grad_norm": 0.3369377851486206, "learning_rate": 0.00016291786634338535, "loss": 2.1899, "step": 213000 }, { "epoch": 10.0, "eval_accuracy": 0.5169506283850192, "eval_loss": 2.4832394123077393, "eval_runtime": 183.5783, "eval_samples_per_second": 383.079, "eval_steps_per_second": 5.987, "step": 213970 }, { "epoch": 10.001402065710145, "grad_norm": 0.3664819300174713, "learning_rate": 0.0001621609334747689, "loss": 2.1935, "step": 214000 }, { "epoch": 10.048137589381689, "grad_norm": 0.3685780465602875, "learning_rate": 0.00016140324291559325, "loss": 2.1375, "step": 215000 }, { "epoch": 10.094873113053232, "grad_norm": 0.36441028118133545, "learning_rate": 0.0001606455523564176, "loss": 2.1397, "step": 216000 }, { "epoch": 10.141608636724774, "grad_norm": 0.3830730617046356, "learning_rate": 0.000159887861797242, "loss": 2.1525, "step": 217000 }, { "epoch": 10.188344160396317, "grad_norm": 0.36895737051963806, "learning_rate": 0.00015913017123806637, "loss": 2.1468, "step": 218000 }, { "epoch": 10.23507968406786, "grad_norm": 0.35749468207359314, "learning_rate": 0.00015837248067889073, "loss": 2.1526, "step": 219000 }, { "epoch": 10.281815207739402, "grad_norm": 0.38449355959892273, "learning_rate": 0.00015761554781027425, "loss": 2.151, "step": 220000 }, { "epoch": 10.328550731410946, "grad_norm": 0.39128056168556213, "learning_rate": 0.00015685785725109866, "loss": 2.1625, "step": 221000 }, { "epoch": 10.375286255082488, "grad_norm": 0.3806777596473694, "learning_rate": 0.00015610168207304137, "loss": 2.1613, "step": 222000 }, { "epoch": 10.422021778754031, "grad_norm": 0.3579350411891937, "learning_rate": 0.00015534399151386573, "loss": 2.1629, "step": 223000 }, { "epoch": 10.468757302425574, "grad_norm": 0.3410700857639313, "learning_rate": 0.00015458705864524925, "loss": 2.162, "step": 224000 }, { "epoch": 10.515492826097116, "grad_norm": 0.36416828632354736, "learning_rate": 0.00015382936808607366, "loss": 2.1618, "step": 225000 }, { "epoch": 10.56222834976866, "grad_norm": 0.3888450860977173, "learning_rate": 0.00015307243521745718, "loss": 2.1703, "step": 226000 }, { "epoch": 10.608963873440201, "grad_norm": 0.3778453469276428, "learning_rate": 0.00015231474465828154, "loss": 2.1659, "step": 227000 }, { "epoch": 10.655699397111745, "grad_norm": 0.3809790313243866, "learning_rate": 0.00015155781178966508, "loss": 2.1786, "step": 228000 }, { "epoch": 10.702434920783288, "grad_norm": 0.35307931900024414, "learning_rate": 0.00015080012123048944, "loss": 2.1743, "step": 229000 }, { "epoch": 10.74917044445483, "grad_norm": 0.34989428520202637, "learning_rate": 0.0001500424306713138, "loss": 2.1838, "step": 230000 }, { "epoch": 10.795905968126373, "grad_norm": 0.372887521982193, "learning_rate": 0.00014928625549325654, "loss": 2.1731, "step": 231000 }, { "epoch": 10.842641491797915, "grad_norm": 0.34077638387680054, "learning_rate": 0.0001485285649340809, "loss": 2.1707, "step": 232000 }, { "epoch": 10.889377015469458, "grad_norm": 0.38488736748695374, "learning_rate": 0.00014777087437490527, "loss": 2.1761, "step": 233000 }, { "epoch": 10.936112539141002, "grad_norm": 0.36446672677993774, "learning_rate": 0.00014701318381572963, "loss": 2.1727, "step": 234000 }, { "epoch": 10.982848062812543, "grad_norm": 0.4053245186805725, "learning_rate": 0.000146255493256554, "loss": 2.1699, "step": 235000 }, { "epoch": 11.0, "eval_accuracy": 0.5176925060817075, "eval_loss": 2.4809539318084717, "eval_runtime": 183.5125, "eval_samples_per_second": 383.216, "eval_steps_per_second": 5.989, "step": 235367 }, { "epoch": 11.029583586484087, "grad_norm": 0.37868165969848633, "learning_rate": 0.00014549856038793756, "loss": 2.1413, "step": 236000 }, { "epoch": 11.076319110155628, "grad_norm": 0.366122841835022, "learning_rate": 0.00014474086982876192, "loss": 2.1282, "step": 237000 }, { "epoch": 11.123054633827172, "grad_norm": 0.35669025778770447, "learning_rate": 0.0001439831792695863, "loss": 2.1253, "step": 238000 }, { "epoch": 11.169790157498715, "grad_norm": 0.43152889609336853, "learning_rate": 0.00014322548871041065, "loss": 2.1358, "step": 239000 }, { "epoch": 11.216525681170257, "grad_norm": 0.37372422218322754, "learning_rate": 0.00014246931353235337, "loss": 2.1377, "step": 240000 }, { "epoch": 11.2632612048418, "grad_norm": 0.37025725841522217, "learning_rate": 0.00014171162297317772, "loss": 2.1344, "step": 241000 }, { "epoch": 11.309996728513344, "grad_norm": 0.37420186400413513, "learning_rate": 0.0001409539324140021, "loss": 2.1372, "step": 242000 }, { "epoch": 11.356732252184885, "grad_norm": 0.4168432652950287, "learning_rate": 0.00014019699954538565, "loss": 2.1358, "step": 243000 }, { "epoch": 11.403467775856429, "grad_norm": 0.38983049988746643, "learning_rate": 0.00013943930898621, "loss": 2.1417, "step": 244000 }, { "epoch": 11.45020329952797, "grad_norm": 0.42163345217704773, "learning_rate": 0.00013868237611759356, "loss": 2.1349, "step": 245000 }, { "epoch": 11.496938823199514, "grad_norm": 0.33661991357803345, "learning_rate": 0.00013792468555841794, "loss": 2.1423, "step": 246000 }, { "epoch": 11.543674346871057, "grad_norm": 0.382899671792984, "learning_rate": 0.00013716775268980146, "loss": 2.1383, "step": 247000 }, { "epoch": 11.590409870542599, "grad_norm": 0.38625696301460266, "learning_rate": 0.00013641006213062584, "loss": 2.1489, "step": 248000 }, { "epoch": 11.637145394214143, "grad_norm": 0.3666653633117676, "learning_rate": 0.0001356523715714502, "loss": 2.1569, "step": 249000 }, { "epoch": 11.683880917885684, "grad_norm": 0.36036914587020874, "learning_rate": 0.00013489543870283375, "loss": 2.1537, "step": 250000 }, { "epoch": 11.730616441557228, "grad_norm": 0.38993409276008606, "learning_rate": 0.00013413774814365813, "loss": 2.1538, "step": 251000 }, { "epoch": 11.777351965228771, "grad_norm": 0.37135306000709534, "learning_rate": 0.00013338081527504165, "loss": 2.1466, "step": 252000 }, { "epoch": 11.824087488900313, "grad_norm": 0.37696388363838196, "learning_rate": 0.00013262312471586603, "loss": 2.1556, "step": 253000 }, { "epoch": 11.870823012571856, "grad_norm": 0.40719184279441833, "learning_rate": 0.00013186543415669042, "loss": 2.1623, "step": 254000 }, { "epoch": 11.917558536243398, "grad_norm": 0.3529217541217804, "learning_rate": 0.00013110774359751477, "loss": 2.1526, "step": 255000 }, { "epoch": 11.964294059914941, "grad_norm": 0.38155487179756165, "learning_rate": 0.0001303508107288983, "loss": 2.1554, "step": 256000 }, { "epoch": 12.0, "eval_accuracy": 0.5181830576524957, "eval_loss": 2.4812095165252686, "eval_runtime": 183.6323, "eval_samples_per_second": 382.966, "eval_steps_per_second": 5.985, "step": 256764 }, { "epoch": 12.011029583586485, "grad_norm": 0.3908240795135498, "learning_rate": 0.00012959387786028184, "loss": 2.1357, "step": 257000 }, { "epoch": 12.057765107258026, "grad_norm": 0.38744133710861206, "learning_rate": 0.00012883618730110622, "loss": 2.1031, "step": 258000 }, { "epoch": 12.10450063092957, "grad_norm": 0.39171701669692993, "learning_rate": 0.00012807849674193058, "loss": 2.1114, "step": 259000 }, { "epoch": 12.151236154601113, "grad_norm": 0.41075876355171204, "learning_rate": 0.00012732080618275496, "loss": 2.1134, "step": 260000 }, { "epoch": 12.197971678272655, "grad_norm": 0.4411497712135315, "learning_rate": 0.0001265638733141385, "loss": 2.1155, "step": 261000 }, { "epoch": 12.244707201944198, "grad_norm": 0.3990738093852997, "learning_rate": 0.00012580618275496286, "loss": 2.1205, "step": 262000 }, { "epoch": 12.29144272561574, "grad_norm": 0.4064374566078186, "learning_rate": 0.00012504924988634639, "loss": 2.1166, "step": 263000 }, { "epoch": 12.338178249287283, "grad_norm": 0.40419068932533264, "learning_rate": 0.00012429155932717077, "loss": 2.1222, "step": 264000 }, { "epoch": 12.384913772958827, "grad_norm": 0.37660491466522217, "learning_rate": 0.00012353462645855432, "loss": 2.1313, "step": 265000 }, { "epoch": 12.431649296630368, "grad_norm": 0.4027993381023407, "learning_rate": 0.00012277693589937867, "loss": 2.1206, "step": 266000 }, { "epoch": 12.478384820301912, "grad_norm": 0.39191654324531555, "learning_rate": 0.00012201924534020305, "loss": 2.1279, "step": 267000 }, { "epoch": 12.525120343973454, "grad_norm": 0.41823610663414, "learning_rate": 0.00012126231247158659, "loss": 2.1291, "step": 268000 }, { "epoch": 12.571855867644997, "grad_norm": 0.3858267664909363, "learning_rate": 0.00012050462191241096, "loss": 2.1265, "step": 269000 }, { "epoch": 12.61859139131654, "grad_norm": 0.4283806383609772, "learning_rate": 0.00011974693135323533, "loss": 2.1296, "step": 270000 }, { "epoch": 12.665326914988082, "grad_norm": 0.36854031682014465, "learning_rate": 0.0001189892407940597, "loss": 2.1351, "step": 271000 }, { "epoch": 12.712062438659625, "grad_norm": 0.3683795928955078, "learning_rate": 0.00011823230792544323, "loss": 2.1203, "step": 272000 }, { "epoch": 12.758797962331167, "grad_norm": 0.40869027376174927, "learning_rate": 0.00011747461736626761, "loss": 2.1343, "step": 273000 }, { "epoch": 12.80553348600271, "grad_norm": 0.3861792981624603, "learning_rate": 0.00011671768449765115, "loss": 2.1327, "step": 274000 }, { "epoch": 12.852269009674254, "grad_norm": 0.3687632381916046, "learning_rate": 0.00011595999393847552, "loss": 2.1362, "step": 275000 }, { "epoch": 12.899004533345796, "grad_norm": 0.40457507967948914, "learning_rate": 0.00011520230337929988, "loss": 2.1375, "step": 276000 }, { "epoch": 12.945740057017339, "grad_norm": 0.37788259983062744, "learning_rate": 0.00011444537051068342, "loss": 2.1384, "step": 277000 }, { "epoch": 12.992475580688883, "grad_norm": 0.3549434542655945, "learning_rate": 0.00011368767995150779, "loss": 2.131, "step": 278000 }, { "epoch": 13.0, "eval_accuracy": 0.5186614527787653, "eval_loss": 2.4790453910827637, "eval_runtime": 183.9494, "eval_samples_per_second": 382.306, "eval_steps_per_second": 5.974, "step": 278161 }, { "epoch": 13.039211104360424, "grad_norm": 0.38944530487060547, "learning_rate": 0.00011292998939233217, "loss": 2.0927, "step": 279000 }, { "epoch": 13.085946628031968, "grad_norm": 0.3954192101955414, "learning_rate": 0.0001121730565237157, "loss": 2.0879, "step": 280000 }, { "epoch": 13.13268215170351, "grad_norm": 0.39661821722984314, "learning_rate": 0.00011141536596454006, "loss": 2.0967, "step": 281000 }, { "epoch": 13.179417675375053, "grad_norm": 0.4078295826911926, "learning_rate": 0.00011065767540536444, "loss": 2.0992, "step": 282000 }, { "epoch": 13.226153199046596, "grad_norm": 0.41279336810112, "learning_rate": 0.00010990074253674798, "loss": 2.1065, "step": 283000 }, { "epoch": 13.272888722718138, "grad_norm": 0.4141485095024109, "learning_rate": 0.00010914305197757235, "loss": 2.101, "step": 284000 }, { "epoch": 13.319624246389681, "grad_norm": 0.37772706151008606, "learning_rate": 0.00010838536141839672, "loss": 2.1033, "step": 285000 }, { "epoch": 13.366359770061223, "grad_norm": 0.42048418521881104, "learning_rate": 0.00010762842854978026, "loss": 2.1037, "step": 286000 }, { "epoch": 13.413095293732766, "grad_norm": 0.3956892192363739, "learning_rate": 0.00010687073799060462, "loss": 2.1035, "step": 287000 }, { "epoch": 13.45983081740431, "grad_norm": 0.40276169776916504, "learning_rate": 0.000106113047431429, "loss": 2.1035, "step": 288000 }, { "epoch": 13.506566341075851, "grad_norm": 0.37681591510772705, "learning_rate": 0.00010535611456281254, "loss": 2.1085, "step": 289000 }, { "epoch": 13.553301864747395, "grad_norm": 0.4368800222873688, "learning_rate": 0.0001045984240036369, "loss": 2.1182, "step": 290000 }, { "epoch": 13.600037388418937, "grad_norm": 0.3877386152744293, "learning_rate": 0.00010384073344446127, "loss": 2.1126, "step": 291000 }, { "epoch": 13.64677291209048, "grad_norm": 0.3556603491306305, "learning_rate": 0.00010308380057584481, "loss": 2.1111, "step": 292000 }, { "epoch": 13.693508435762023, "grad_norm": 0.40014851093292236, "learning_rate": 0.00010232611001666918, "loss": 2.1132, "step": 293000 }, { "epoch": 13.740243959433565, "grad_norm": 0.4012806713581085, "learning_rate": 0.00010156917714805273, "loss": 2.1129, "step": 294000 }, { "epoch": 13.786979483105108, "grad_norm": 0.42094719409942627, "learning_rate": 0.0001008114865888771, "loss": 2.1169, "step": 295000 }, { "epoch": 13.83371500677665, "grad_norm": 0.40798112750053406, "learning_rate": 0.00010005379602970145, "loss": 2.1097, "step": 296000 }, { "epoch": 13.880450530448194, "grad_norm": 0.3869474530220032, "learning_rate": 9.929686316108501e-05, "loss": 2.1168, "step": 297000 }, { "epoch": 13.927186054119737, "grad_norm": 0.4175203740596771, "learning_rate": 9.853917260190937e-05, "loss": 2.1117, "step": 298000 }, { "epoch": 13.973921577791279, "grad_norm": 0.36646854877471924, "learning_rate": 9.778223973329292e-05, "loss": 2.1231, "step": 299000 }, { "epoch": 14.0, "eval_accuracy": 0.5187720318130807, "eval_loss": 2.481027841567993, "eval_runtime": 184.1596, "eval_samples_per_second": 381.87, "eval_steps_per_second": 5.968, "step": 299558 }, { "epoch": 14.020657101462822, "grad_norm": 0.3873392343521118, "learning_rate": 9.702530686467645e-05, "loss": 2.1009, "step": 300000 }, { "epoch": 14.067392625134366, "grad_norm": 0.4304318428039551, "learning_rate": 9.626761630550083e-05, "loss": 2.0792, "step": 301000 }, { "epoch": 14.114128148805907, "grad_norm": 0.36497995257377625, "learning_rate": 9.550992574632519e-05, "loss": 2.0768, "step": 302000 }, { "epoch": 14.16086367247745, "grad_norm": 0.38602423667907715, "learning_rate": 9.475223518714957e-05, "loss": 2.0741, "step": 303000 }, { "epoch": 14.207599196148992, "grad_norm": 0.4310854375362396, "learning_rate": 9.399454462797393e-05, "loss": 2.078, "step": 304000 }, { "epoch": 14.254334719820536, "grad_norm": 0.3950233459472656, "learning_rate": 9.323761175935746e-05, "loss": 2.0915, "step": 305000 }, { "epoch": 14.30107024349208, "grad_norm": 0.40887415409088135, "learning_rate": 9.247992120018184e-05, "loss": 2.0884, "step": 306000 }, { "epoch": 14.34780576716362, "grad_norm": 0.4131264388561249, "learning_rate": 9.172298833156538e-05, "loss": 2.0976, "step": 307000 }, { "epoch": 14.394541290835164, "grad_norm": 0.41139036417007446, "learning_rate": 9.096529777238975e-05, "loss": 2.0886, "step": 308000 }, { "epoch": 14.441276814506706, "grad_norm": 0.44874390959739685, "learning_rate": 9.020760721321412e-05, "loss": 2.0889, "step": 309000 }, { "epoch": 14.48801233817825, "grad_norm": 0.38657814264297485, "learning_rate": 8.944991665403849e-05, "loss": 2.0933, "step": 310000 }, { "epoch": 14.534747861849793, "grad_norm": 0.3783547580242157, "learning_rate": 8.869222609486284e-05, "loss": 2.0852, "step": 311000 }, { "epoch": 14.581483385521334, "grad_norm": 0.35217922925949097, "learning_rate": 8.79352932262464e-05, "loss": 2.0961, "step": 312000 }, { "epoch": 14.628218909192878, "grad_norm": 0.3531019389629364, "learning_rate": 8.717760266707076e-05, "loss": 2.0972, "step": 313000 }, { "epoch": 14.67495443286442, "grad_norm": 0.43090537190437317, "learning_rate": 8.64206697984543e-05, "loss": 2.0943, "step": 314000 }, { "epoch": 14.721689956535963, "grad_norm": 0.41564884781837463, "learning_rate": 8.566297923927867e-05, "loss": 2.0997, "step": 315000 }, { "epoch": 14.768425480207506, "grad_norm": 0.4131323993206024, "learning_rate": 8.490604637066222e-05, "loss": 2.0906, "step": 316000 }, { "epoch": 14.815161003879048, "grad_norm": 0.40813466906547546, "learning_rate": 8.414835581148658e-05, "loss": 2.103, "step": 317000 }, { "epoch": 14.861896527550591, "grad_norm": 0.4064253866672516, "learning_rate": 8.339142294287011e-05, "loss": 2.1013, "step": 318000 }, { "epoch": 14.908632051222135, "grad_norm": 0.4227573275566101, "learning_rate": 8.26337323836945e-05, "loss": 2.0943, "step": 319000 }, { "epoch": 14.955367574893677, "grad_norm": 0.4152744710445404, "learning_rate": 8.187679951507803e-05, "loss": 2.0991, "step": 320000 }, { "epoch": 15.0, "eval_accuracy": 0.5190574852052445, "eval_loss": 2.4825658798217773, "eval_runtime": 183.7288, "eval_samples_per_second": 382.765, "eval_steps_per_second": 5.982, "step": 320955 }, { "epoch": 15.00210309856522, "grad_norm": 0.39815834164619446, "learning_rate": 8.11191089559024e-05, "loss": 2.0967, "step": 321000 }, { "epoch": 15.048838622236762, "grad_norm": 0.39639827609062195, "learning_rate": 8.036217608728595e-05, "loss": 2.0557, "step": 322000 }, { "epoch": 15.095574145908305, "grad_norm": 0.4057682752609253, "learning_rate": 7.960448552811032e-05, "loss": 2.0602, "step": 323000 }, { "epoch": 15.142309669579848, "grad_norm": 0.40997353196144104, "learning_rate": 7.884679496893467e-05, "loss": 2.063, "step": 324000 }, { "epoch": 15.18904519325139, "grad_norm": 0.3955094814300537, "learning_rate": 7.808910440975905e-05, "loss": 2.0665, "step": 325000 }, { "epoch": 15.235780716922934, "grad_norm": 0.43070030212402344, "learning_rate": 7.733141385058341e-05, "loss": 2.0696, "step": 326000 }, { "epoch": 15.282516240594475, "grad_norm": 0.42902061343193054, "learning_rate": 7.657523867252612e-05, "loss": 2.0727, "step": 327000 }, { "epoch": 15.329251764266019, "grad_norm": 0.42950913310050964, "learning_rate": 7.58175481133505e-05, "loss": 2.0748, "step": 328000 }, { "epoch": 15.375987287937562, "grad_norm": 0.4149567782878876, "learning_rate": 7.505985755417486e-05, "loss": 2.0752, "step": 329000 }, { "epoch": 15.422722811609104, "grad_norm": 0.4134847819805145, "learning_rate": 7.430216699499924e-05, "loss": 2.0752, "step": 330000 }, { "epoch": 15.469458335280647, "grad_norm": 0.3985021412372589, "learning_rate": 7.354447643582361e-05, "loss": 2.0776, "step": 331000 }, { "epoch": 15.516193858952189, "grad_norm": 0.4318609833717346, "learning_rate": 7.278754356720715e-05, "loss": 2.0797, "step": 332000 }, { "epoch": 15.562929382623732, "grad_norm": 0.4110203683376312, "learning_rate": 7.202985300803152e-05, "loss": 2.0811, "step": 333000 }, { "epoch": 15.609664906295276, "grad_norm": 0.423392653465271, "learning_rate": 7.127292013941505e-05, "loss": 2.0744, "step": 334000 }, { "epoch": 15.656400429966817, "grad_norm": 0.40932226181030273, "learning_rate": 7.051522958023942e-05, "loss": 2.0779, "step": 335000 }, { "epoch": 15.70313595363836, "grad_norm": 0.42987844347953796, "learning_rate": 6.975829671162297e-05, "loss": 2.0712, "step": 336000 }, { "epoch": 15.749871477309902, "grad_norm": 0.42654335498809814, "learning_rate": 6.900060615244734e-05, "loss": 2.076, "step": 337000 }, { "epoch": 15.796607000981446, "grad_norm": 0.45028138160705566, "learning_rate": 6.82429155932717e-05, "loss": 2.08, "step": 338000 }, { "epoch": 15.84334252465299, "grad_norm": 0.43925535678863525, "learning_rate": 6.748598272465524e-05, "loss": 2.0885, "step": 339000 }, { "epoch": 15.890078048324531, "grad_norm": 0.4209968149662018, "learning_rate": 6.672904985603879e-05, "loss": 2.0859, "step": 340000 }, { "epoch": 15.936813571996074, "grad_norm": 0.3998005986213684, "learning_rate": 6.597135929686316e-05, "loss": 2.0839, "step": 341000 }, { "epoch": 15.983549095667616, "grad_norm": 0.3756837844848633, "learning_rate": 6.521366873768751e-05, "loss": 2.0851, "step": 342000 }, { "epoch": 16.0, "eval_accuracy": 0.519193548162244, "eval_loss": 2.4821929931640625, "eval_runtime": 183.7167, "eval_samples_per_second": 382.79, "eval_steps_per_second": 5.982, "step": 342352 }, { "epoch": 16.03028461933916, "grad_norm": 0.46294063329696655, "learning_rate": 6.445597817851188e-05, "loss": 2.0601, "step": 343000 }, { "epoch": 16.077020143010703, "grad_norm": 0.441850483417511, "learning_rate": 6.369904530989543e-05, "loss": 2.0523, "step": 344000 }, { "epoch": 16.123755666682246, "grad_norm": 0.468127578496933, "learning_rate": 6.29413547507198e-05, "loss": 2.0485, "step": 345000 }, { "epoch": 16.170491190353786, "grad_norm": 0.41471847891807556, "learning_rate": 6.218366419154417e-05, "loss": 2.0433, "step": 346000 }, { "epoch": 16.21722671402533, "grad_norm": 0.4540562927722931, "learning_rate": 6.142673132292772e-05, "loss": 2.0585, "step": 347000 }, { "epoch": 16.263962237696873, "grad_norm": 0.4675719738006592, "learning_rate": 6.066904076375207e-05, "loss": 2.0585, "step": 348000 }, { "epoch": 16.310697761368417, "grad_norm": 0.43343010544776917, "learning_rate": 5.991135020457644e-05, "loss": 2.0519, "step": 349000 }, { "epoch": 16.35743328503996, "grad_norm": 0.4093555808067322, "learning_rate": 5.915365964540082e-05, "loss": 2.0601, "step": 350000 }, { "epoch": 16.404168808711503, "grad_norm": 0.4201803505420685, "learning_rate": 5.8395969086225186e-05, "loss": 2.059, "step": 351000 }, { "epoch": 16.450904332383043, "grad_norm": 0.4299643337726593, "learning_rate": 5.763903621760872e-05, "loss": 2.0602, "step": 352000 }, { "epoch": 16.497639856054587, "grad_norm": 0.42913755774497986, "learning_rate": 5.688134565843309e-05, "loss": 2.0529, "step": 353000 }, { "epoch": 16.54437537972613, "grad_norm": 0.39006081223487854, "learning_rate": 5.612441278981663e-05, "loss": 2.0703, "step": 354000 }, { "epoch": 16.591110903397674, "grad_norm": 0.4356352388858795, "learning_rate": 5.536747992120018e-05, "loss": 2.0618, "step": 355000 }, { "epoch": 16.637846427069217, "grad_norm": 0.4199896454811096, "learning_rate": 5.4610547052583714e-05, "loss": 2.0674, "step": 356000 }, { "epoch": 16.684581950740757, "grad_norm": 0.4299008846282959, "learning_rate": 5.385285649340809e-05, "loss": 2.0648, "step": 357000 }, { "epoch": 16.7313174744123, "grad_norm": 0.4732346832752228, "learning_rate": 5.309516593423246e-05, "loss": 2.0708, "step": 358000 }, { "epoch": 16.778052998083844, "grad_norm": 0.45810797810554504, "learning_rate": 5.233747537505683e-05, "loss": 2.0599, "step": 359000 }, { "epoch": 16.824788521755387, "grad_norm": 0.42283934354782104, "learning_rate": 5.157978481588119e-05, "loss": 2.0613, "step": 360000 }, { "epoch": 16.87152404542693, "grad_norm": 0.44432106614112854, "learning_rate": 5.082209425670556e-05, "loss": 2.0623, "step": 361000 }, { "epoch": 16.91825956909847, "grad_norm": 0.431477814912796, "learning_rate": 5.006591907864827e-05, "loss": 2.0676, "step": 362000 }, { "epoch": 16.964995092770014, "grad_norm": 0.40297403931617737, "learning_rate": 4.930822851947264e-05, "loss": 2.0694, "step": 363000 }, { "epoch": 17.0, "eval_accuracy": 0.519240556798617, "eval_loss": 2.4870810508728027, "eval_runtime": 184.4531, "eval_samples_per_second": 381.262, "eval_steps_per_second": 5.958, "step": 363749 }, { "epoch": 17.011730616441557, "grad_norm": 0.42919251322746277, "learning_rate": 4.855053796029701e-05, "loss": 2.0591, "step": 364000 }, { "epoch": 17.0584661401131, "grad_norm": 0.4822104573249817, "learning_rate": 4.779284740112138e-05, "loss": 2.0356, "step": 365000 }, { "epoch": 17.105201663784644, "grad_norm": 0.44094109535217285, "learning_rate": 4.703591453250492e-05, "loss": 2.0376, "step": 366000 }, { "epoch": 17.151937187456184, "grad_norm": 0.3879323899745941, "learning_rate": 4.627822397332929e-05, "loss": 2.0357, "step": 367000 }, { "epoch": 17.198672711127728, "grad_norm": 0.4346221387386322, "learning_rate": 4.552053341415366e-05, "loss": 2.0424, "step": 368000 }, { "epoch": 17.24540823479927, "grad_norm": 0.42520612478256226, "learning_rate": 4.476284285497802e-05, "loss": 2.0422, "step": 369000 }, { "epoch": 17.292143758470814, "grad_norm": 0.3917822241783142, "learning_rate": 4.400515229580239e-05, "loss": 2.0312, "step": 370000 }, { "epoch": 17.338879282142358, "grad_norm": 0.4468395411968231, "learning_rate": 4.3248977117745104e-05, "loss": 2.0453, "step": 371000 }, { "epoch": 17.385614805813898, "grad_norm": 0.4312225580215454, "learning_rate": 4.249128655856948e-05, "loss": 2.0439, "step": 372000 }, { "epoch": 17.43235032948544, "grad_norm": 0.4082895815372467, "learning_rate": 4.173359599939385e-05, "loss": 2.0425, "step": 373000 }, { "epoch": 17.479085853156985, "grad_norm": 0.4173845052719116, "learning_rate": 4.097590544021822e-05, "loss": 2.0478, "step": 374000 }, { "epoch": 17.525821376828528, "grad_norm": 0.47007617354393005, "learning_rate": 4.021897257160175e-05, "loss": 2.0374, "step": 375000 }, { "epoch": 17.57255690050007, "grad_norm": 0.45107054710388184, "learning_rate": 3.94620397029853e-05, "loss": 2.0552, "step": 376000 }, { "epoch": 17.61929242417161, "grad_norm": 0.43024107813835144, "learning_rate": 3.870510683436884e-05, "loss": 2.0489, "step": 377000 }, { "epoch": 17.666027947843155, "grad_norm": 0.40413206815719604, "learning_rate": 3.794741627519321e-05, "loss": 2.0532, "step": 378000 }, { "epoch": 17.712763471514698, "grad_norm": 0.4514461159706116, "learning_rate": 3.718972571601757e-05, "loss": 2.0473, "step": 379000 }, { "epoch": 17.75949899518624, "grad_norm": 0.44332849979400635, "learning_rate": 3.643203515684194e-05, "loss": 2.053, "step": 380000 }, { "epoch": 17.806234518857785, "grad_norm": 0.431598424911499, "learning_rate": 3.567434459766631e-05, "loss": 2.048, "step": 381000 }, { "epoch": 17.852970042529325, "grad_norm": 0.4604801833629608, "learning_rate": 3.491741172904985e-05, "loss": 2.0541, "step": 382000 }, { "epoch": 17.89970556620087, "grad_norm": 0.4291286766529083, "learning_rate": 3.415972116987422e-05, "loss": 2.0486, "step": 383000 }, { "epoch": 17.946441089872412, "grad_norm": 0.46012356877326965, "learning_rate": 3.340203061069859e-05, "loss": 2.0515, "step": 384000 }, { "epoch": 17.993176613543955, "grad_norm": 0.44041863083839417, "learning_rate": 3.264509774208213e-05, "loss": 2.0582, "step": 385000 }, { "epoch": 18.0, "eval_accuracy": 0.5194157657375073, "eval_loss": 2.489042043685913, "eval_runtime": 184.3996, "eval_samples_per_second": 381.373, "eval_steps_per_second": 5.96, "step": 385146 }, { "epoch": 18.0399121372155, "grad_norm": 0.4347294270992279, "learning_rate": 3.1887407182906494e-05, "loss": 2.026, "step": 386000 }, { "epoch": 18.08664766088704, "grad_norm": 0.4946117699146271, "learning_rate": 3.113047431429004e-05, "loss": 2.0199, "step": 387000 }, { "epoch": 18.133383184558582, "grad_norm": 0.4774208068847656, "learning_rate": 3.0372783755114407e-05, "loss": 2.0277, "step": 388000 }, { "epoch": 18.180118708230125, "grad_norm": 0.42027777433395386, "learning_rate": 2.9615093195938773e-05, "loss": 2.027, "step": 389000 }, { "epoch": 18.22685423190167, "grad_norm": 0.47702670097351074, "learning_rate": 2.8857402636763145e-05, "loss": 2.0251, "step": 390000 }, { "epoch": 18.273589755573212, "grad_norm": 0.45635542273521423, "learning_rate": 2.8100469768146687e-05, "loss": 2.0282, "step": 391000 }, { "epoch": 18.320325279244756, "grad_norm": 0.42526108026504517, "learning_rate": 2.7343536899530232e-05, "loss": 2.0332, "step": 392000 }, { "epoch": 18.367060802916296, "grad_norm": 0.4558388292789459, "learning_rate": 2.6585846340354594e-05, "loss": 2.0351, "step": 393000 }, { "epoch": 18.41379632658784, "grad_norm": 0.42901089787483215, "learning_rate": 2.5828155781178966e-05, "loss": 2.0279, "step": 394000 }, { "epoch": 18.460531850259382, "grad_norm": 0.4340655207633972, "learning_rate": 2.5071222912562508e-05, "loss": 2.0378, "step": 395000 }, { "epoch": 18.507267373930926, "grad_norm": 0.4176914095878601, "learning_rate": 2.4313532353386873e-05, "loss": 2.0284, "step": 396000 }, { "epoch": 18.55400289760247, "grad_norm": 0.44839930534362793, "learning_rate": 2.3556599484770418e-05, "loss": 2.029, "step": 397000 }, { "epoch": 18.60073842127401, "grad_norm": 0.44085559248924255, "learning_rate": 2.2798908925594787e-05, "loss": 2.035, "step": 398000 }, { "epoch": 18.647473944945553, "grad_norm": 0.4037966728210449, "learning_rate": 2.2041218366419152e-05, "loss": 2.0359, "step": 399000 }, { "epoch": 18.694209468617096, "grad_norm": 0.4513247311115265, "learning_rate": 2.128352780724352e-05, "loss": 2.0357, "step": 400000 }, { "epoch": 18.74094499228864, "grad_norm": 0.4081607162952423, "learning_rate": 2.0525837248067887e-05, "loss": 2.0394, "step": 401000 }, { "epoch": 18.787680515960183, "grad_norm": 0.42668506503105164, "learning_rate": 1.976890437945143e-05, "loss": 2.0359, "step": 402000 }, { "epoch": 18.834416039631723, "grad_norm": 0.476225346326828, "learning_rate": 1.9011971510834973e-05, "loss": 2.0299, "step": 403000 }, { "epoch": 18.881151563303266, "grad_norm": 0.4597811698913574, "learning_rate": 1.825428095165934e-05, "loss": 2.0309, "step": 404000 }, { "epoch": 18.92788708697481, "grad_norm": 0.4811764359474182, "learning_rate": 1.7497348083042884e-05, "loss": 2.038, "step": 405000 }, { "epoch": 18.974622610646353, "grad_norm": 0.4466439485549927, "learning_rate": 1.673965752386725e-05, "loss": 2.0422, "step": 406000 }, { "epoch": 19.0, "eval_accuracy": 0.5193626231153677, "eval_loss": 2.4923205375671387, "eval_runtime": 184.7287, "eval_samples_per_second": 380.694, "eval_steps_per_second": 5.949, "step": 406543 }, { "epoch": 19.021358134317897, "grad_norm": 0.44981294870376587, "learning_rate": 1.5981966964691618e-05, "loss": 2.0278, "step": 407000 }, { "epoch": 19.068093657989436, "grad_norm": 0.41826167702674866, "learning_rate": 1.5225034096075161e-05, "loss": 2.0176, "step": 408000 }, { "epoch": 19.11482918166098, "grad_norm": 0.5001415014266968, "learning_rate": 1.4467343536899529e-05, "loss": 2.0121, "step": 409000 }, { "epoch": 19.161564705332523, "grad_norm": 0.47413039207458496, "learning_rate": 1.3709652977723896e-05, "loss": 2.0172, "step": 410000 }, { "epoch": 19.208300229004067, "grad_norm": 0.4496116638183594, "learning_rate": 1.2951962418548263e-05, "loss": 2.0206, "step": 411000 }, { "epoch": 19.25503575267561, "grad_norm": 0.413718581199646, "learning_rate": 1.2195029549931806e-05, "loss": 2.0237, "step": 412000 }, { "epoch": 19.30177127634715, "grad_norm": 0.4080525040626526, "learning_rate": 1.1437338990756173e-05, "loss": 2.0204, "step": 413000 }, { "epoch": 19.348506800018693, "grad_norm": 0.4642179608345032, "learning_rate": 1.0680406122139717e-05, "loss": 2.0251, "step": 414000 }, { "epoch": 19.395242323690237, "grad_norm": 0.43880292773246765, "learning_rate": 9.922715562964084e-06, "loss": 2.0234, "step": 415000 }, { "epoch": 19.44197784736178, "grad_norm": 0.43078604340553284, "learning_rate": 9.165025003788451e-06, "loss": 2.0157, "step": 416000 }, { "epoch": 19.488713371033324, "grad_norm": 0.4106679856777191, "learning_rate": 8.408092135171994e-06, "loss": 2.0227, "step": 417000 }, { "epoch": 19.535448894704864, "grad_norm": 0.40442559123039246, "learning_rate": 7.650401575996361e-06, "loss": 2.0226, "step": 418000 }, { "epoch": 19.582184418376407, "grad_norm": 0.43674561381340027, "learning_rate": 6.8934687073799055e-06, "loss": 2.0116, "step": 419000 }, { "epoch": 19.62891994204795, "grad_norm": 0.40172746777534485, "learning_rate": 6.135778148204273e-06, "loss": 2.0143, "step": 420000 }, { "epoch": 19.675655465719494, "grad_norm": 0.45278018712997437, "learning_rate": 5.37808758902864e-06, "loss": 2.0212, "step": 421000 }, { "epoch": 19.722390989391037, "grad_norm": 0.45401936769485474, "learning_rate": 4.621154720412183e-06, "loss": 2.0186, "step": 422000 }, { "epoch": 19.769126513062577, "grad_norm": 0.45410868525505066, "learning_rate": 3.86346416123655e-06, "loss": 2.0216, "step": 423000 }, { "epoch": 19.81586203673412, "grad_norm": 0.44420313835144043, "learning_rate": 3.1065312926200936e-06, "loss": 2.0232, "step": 424000 }, { "epoch": 19.862597560405664, "grad_norm": 0.43311986327171326, "learning_rate": 2.349598424003637e-06, "loss": 2.0206, "step": 425000 }, { "epoch": 19.909333084077208, "grad_norm": 0.40267816185951233, "learning_rate": 1.591907864828004e-06, "loss": 2.021, "step": 426000 }, { "epoch": 19.95606860774875, "grad_norm": 0.4614889621734619, "learning_rate": 8.342173056523715e-07, "loss": 2.0188, "step": 427000 }, { "epoch": 20.0, "eval_accuracy": 0.5192785875103685, "eval_loss": 2.4960787296295166, "eval_runtime": 184.2593, "eval_samples_per_second": 381.663, "eval_steps_per_second": 5.964, "step": 427940 }, { "epoch": 20.0, "step": 427940, "total_flos": 1.7890534785024e+18, "train_loss": 2.2675810582360225, "train_runtime": 92416.4259, "train_samples_per_second": 148.176, "train_steps_per_second": 4.631 } ], "logging_steps": 1000, "max_steps": 427940, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7890534785024e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }