{ "best_metric": 0.2954551577568054, "best_model_checkpoint": "./cifar100_outputs/checkpoint-26565", "epoch": 5.0, "eval_steps": 500, "global_step": 26565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018821757952192735, "grad_norm": 2.8234074115753174, "learning_rate": 1.9992471296819125e-05, "loss": 4.6312, "step": 10 }, { "epoch": 0.003764351590438547, "grad_norm": 2.5866012573242188, "learning_rate": 1.9984942593638247e-05, "loss": 4.5949, "step": 20 }, { "epoch": 0.00564652738565782, "grad_norm": 2.6120123863220215, "learning_rate": 1.997741389045737e-05, "loss": 4.6003, "step": 30 }, { "epoch": 0.007528703180877094, "grad_norm": 2.558011293411255, "learning_rate": 1.9969885187276493e-05, "loss": 4.6029, "step": 40 }, { "epoch": 0.009410878976096368, "grad_norm": 2.4474682807922363, "learning_rate": 1.9962356484095616e-05, "loss": 4.6158, "step": 50 }, { "epoch": 0.01129305477131564, "grad_norm": 2.758237838745117, "learning_rate": 1.995482778091474e-05, "loss": 4.6014, "step": 60 }, { "epoch": 0.013175230566534914, "grad_norm": 4.711178779602051, "learning_rate": 1.9947299077733862e-05, "loss": 4.5894, "step": 70 }, { "epoch": 0.015057406361754188, "grad_norm": 2.3408522605895996, "learning_rate": 1.9939770374552985e-05, "loss": 4.5783, "step": 80 }, { "epoch": 0.01693958215697346, "grad_norm": 2.6499340534210205, "learning_rate": 1.9932241671372108e-05, "loss": 4.5679, "step": 90 }, { "epoch": 0.018821757952192736, "grad_norm": 2.65826678276062, "learning_rate": 1.992471296819123e-05, "loss": 4.5715, "step": 100 }, { "epoch": 0.020703933747412008, "grad_norm": 2.484968662261963, "learning_rate": 1.9917184265010354e-05, "loss": 4.5451, "step": 110 }, { "epoch": 0.02258610954263128, "grad_norm": 2.5142550468444824, "learning_rate": 1.9909655561829477e-05, "loss": 4.5675, "step": 120 }, { "epoch": 0.024468285337850556, "grad_norm": 2.3911235332489014, "learning_rate": 1.99021268586486e-05, "loss": 4.5623, "step": 130 }, { "epoch": 0.026350461133069828, "grad_norm": 2.7664413452148438, "learning_rate": 1.9894598155467723e-05, "loss": 4.5548, "step": 140 }, { "epoch": 0.028232636928289104, "grad_norm": 2.7500317096710205, "learning_rate": 1.9887069452286845e-05, "loss": 4.5317, "step": 150 }, { "epoch": 0.030114812723508376, "grad_norm": 2.9022419452667236, "learning_rate": 1.987954074910597e-05, "loss": 4.5442, "step": 160 }, { "epoch": 0.03199698851872765, "grad_norm": 3.4607958793640137, "learning_rate": 1.987201204592509e-05, "loss": 4.5409, "step": 170 }, { "epoch": 0.03387916431394692, "grad_norm": 2.7307426929473877, "learning_rate": 1.9864483342744214e-05, "loss": 4.5272, "step": 180 }, { "epoch": 0.0357613401091662, "grad_norm": 2.797895669937134, "learning_rate": 1.9856954639563337e-05, "loss": 4.5225, "step": 190 }, { "epoch": 0.03764351590438547, "grad_norm": 2.6407244205474854, "learning_rate": 1.984942593638246e-05, "loss": 4.505, "step": 200 }, { "epoch": 0.039525691699604744, "grad_norm": 2.401859760284424, "learning_rate": 1.9841897233201583e-05, "loss": 4.5198, "step": 210 }, { "epoch": 0.041407867494824016, "grad_norm": 2.945293664932251, "learning_rate": 1.9834368530020706e-05, "loss": 4.5059, "step": 220 }, { "epoch": 0.04329004329004329, "grad_norm": 2.723217248916626, "learning_rate": 1.982683982683983e-05, "loss": 4.4505, "step": 230 }, { "epoch": 0.04517221908526256, "grad_norm": 3.357862710952759, "learning_rate": 1.9819311123658952e-05, "loss": 4.4769, "step": 240 }, { "epoch": 0.04705439488048184, "grad_norm": 3.170531749725342, "learning_rate": 1.9811782420478075e-05, "loss": 4.47, "step": 250 }, { "epoch": 0.04893657067570111, "grad_norm": 2.830183744430542, "learning_rate": 1.9804253717297198e-05, "loss": 4.4742, "step": 260 }, { "epoch": 0.050818746470920384, "grad_norm": 2.7911784648895264, "learning_rate": 1.979672501411632e-05, "loss": 4.4557, "step": 270 }, { "epoch": 0.052700922266139656, "grad_norm": 2.965761423110962, "learning_rate": 1.9789196310935443e-05, "loss": 4.4504, "step": 280 }, { "epoch": 0.05458309806135893, "grad_norm": 3.0982887744903564, "learning_rate": 1.9781667607754566e-05, "loss": 4.4378, "step": 290 }, { "epoch": 0.05646527385657821, "grad_norm": 3.3339033126831055, "learning_rate": 1.977413890457369e-05, "loss": 4.4213, "step": 300 }, { "epoch": 0.05834744965179748, "grad_norm": 2.5607752799987793, "learning_rate": 1.9766610201392812e-05, "loss": 4.4296, "step": 310 }, { "epoch": 0.06022962544701675, "grad_norm": 2.7375972270965576, "learning_rate": 1.9759081498211935e-05, "loss": 4.4418, "step": 320 }, { "epoch": 0.062111801242236024, "grad_norm": 2.757209539413452, "learning_rate": 1.9751552795031058e-05, "loss": 4.3921, "step": 330 }, { "epoch": 0.0639939770374553, "grad_norm": 3.423086404800415, "learning_rate": 1.974402409185018e-05, "loss": 4.408, "step": 340 }, { "epoch": 0.06587615283267458, "grad_norm": 3.178119421005249, "learning_rate": 1.9736495388669304e-05, "loss": 4.3862, "step": 350 }, { "epoch": 0.06775832862789384, "grad_norm": 2.718017816543579, "learning_rate": 1.9728966685488427e-05, "loss": 4.3867, "step": 360 }, { "epoch": 0.06964050442311312, "grad_norm": 2.780374765396118, "learning_rate": 1.972143798230755e-05, "loss": 4.3705, "step": 370 }, { "epoch": 0.0715226802183324, "grad_norm": 3.7188100814819336, "learning_rate": 1.9713909279126673e-05, "loss": 4.3587, "step": 380 }, { "epoch": 0.07340485601355166, "grad_norm": 2.7755348682403564, "learning_rate": 1.9706380575945796e-05, "loss": 4.38, "step": 390 }, { "epoch": 0.07528703180877094, "grad_norm": 3.0622341632843018, "learning_rate": 1.969885187276492e-05, "loss": 4.3634, "step": 400 }, { "epoch": 0.07716920760399021, "grad_norm": 3.146724224090576, "learning_rate": 1.969132316958404e-05, "loss": 4.3677, "step": 410 }, { "epoch": 0.07905138339920949, "grad_norm": 4.22658634185791, "learning_rate": 1.9683794466403164e-05, "loss": 4.3087, "step": 420 }, { "epoch": 0.08093355919442875, "grad_norm": 3.4560253620147705, "learning_rate": 1.9676265763222287e-05, "loss": 4.3527, "step": 430 }, { "epoch": 0.08281573498964803, "grad_norm": 2.921839714050293, "learning_rate": 1.966873706004141e-05, "loss": 4.3253, "step": 440 }, { "epoch": 0.08469791078486731, "grad_norm": 2.827158212661743, "learning_rate": 1.9661208356860533e-05, "loss": 4.3189, "step": 450 }, { "epoch": 0.08658008658008658, "grad_norm": 2.8546581268310547, "learning_rate": 1.9653679653679656e-05, "loss": 4.2886, "step": 460 }, { "epoch": 0.08846226237530586, "grad_norm": 3.17734694480896, "learning_rate": 1.964615095049878e-05, "loss": 4.3264, "step": 470 }, { "epoch": 0.09034443817052512, "grad_norm": 3.0701353549957275, "learning_rate": 1.96386222473179e-05, "loss": 4.2996, "step": 480 }, { "epoch": 0.0922266139657444, "grad_norm": 2.914555549621582, "learning_rate": 1.9631093544137025e-05, "loss": 4.2455, "step": 490 }, { "epoch": 0.09410878976096368, "grad_norm": 2.8697192668914795, "learning_rate": 1.9623564840956148e-05, "loss": 4.2583, "step": 500 }, { "epoch": 0.09599096555618294, "grad_norm": 2.944056272506714, "learning_rate": 1.961603613777527e-05, "loss": 4.2599, "step": 510 }, { "epoch": 0.09787314135140222, "grad_norm": 3.0705983638763428, "learning_rate": 1.9608507434594394e-05, "loss": 4.2685, "step": 520 }, { "epoch": 0.09975531714662149, "grad_norm": 3.1787703037261963, "learning_rate": 1.9600978731413516e-05, "loss": 4.2272, "step": 530 }, { "epoch": 0.10163749294184077, "grad_norm": 3.236412763595581, "learning_rate": 1.959345002823264e-05, "loss": 4.251, "step": 540 }, { "epoch": 0.10351966873706005, "grad_norm": 3.3419723510742188, "learning_rate": 1.9585921325051762e-05, "loss": 4.2056, "step": 550 }, { "epoch": 0.10540184453227931, "grad_norm": 3.7011070251464844, "learning_rate": 1.9578392621870885e-05, "loss": 4.2403, "step": 560 }, { "epoch": 0.10728402032749859, "grad_norm": 3.077976703643799, "learning_rate": 1.9570863918690005e-05, "loss": 4.2163, "step": 570 }, { "epoch": 0.10916619612271786, "grad_norm": 3.373072624206543, "learning_rate": 1.956333521550913e-05, "loss": 4.2113, "step": 580 }, { "epoch": 0.11104837191793714, "grad_norm": 3.0897161960601807, "learning_rate": 1.9555806512328254e-05, "loss": 4.1671, "step": 590 }, { "epoch": 0.11293054771315642, "grad_norm": 3.42901873588562, "learning_rate": 1.9548277809147377e-05, "loss": 4.1616, "step": 600 }, { "epoch": 0.11481272350837568, "grad_norm": 3.4786670207977295, "learning_rate": 1.95407491059665e-05, "loss": 4.1452, "step": 610 }, { "epoch": 0.11669489930359496, "grad_norm": 2.832836151123047, "learning_rate": 1.9533220402785623e-05, "loss": 4.1742, "step": 620 }, { "epoch": 0.11857707509881422, "grad_norm": 3.0124964714050293, "learning_rate": 1.9525691699604746e-05, "loss": 4.1412, "step": 630 }, { "epoch": 0.1204592508940335, "grad_norm": 2.7018187046051025, "learning_rate": 1.951816299642387e-05, "loss": 4.1027, "step": 640 }, { "epoch": 0.12234142668925278, "grad_norm": 3.9656338691711426, "learning_rate": 1.951063429324299e-05, "loss": 4.1837, "step": 650 }, { "epoch": 0.12422360248447205, "grad_norm": 3.147698402404785, "learning_rate": 1.950310559006211e-05, "loss": 4.0989, "step": 660 }, { "epoch": 0.12610577827969133, "grad_norm": 3.232612371444702, "learning_rate": 1.9495576886881234e-05, "loss": 4.0819, "step": 670 }, { "epoch": 0.1279879540749106, "grad_norm": 3.327009916305542, "learning_rate": 1.948804818370036e-05, "loss": 4.0768, "step": 680 }, { "epoch": 0.12987012987012986, "grad_norm": 3.508767604827881, "learning_rate": 1.9480519480519483e-05, "loss": 4.0784, "step": 690 }, { "epoch": 0.13175230566534915, "grad_norm": 3.3220221996307373, "learning_rate": 1.9472990777338606e-05, "loss": 4.0878, "step": 700 }, { "epoch": 0.13363448146056842, "grad_norm": 2.8315317630767822, "learning_rate": 1.946546207415773e-05, "loss": 4.0156, "step": 710 }, { "epoch": 0.13551665725578768, "grad_norm": 3.6923279762268066, "learning_rate": 1.9457933370976852e-05, "loss": 4.0251, "step": 720 }, { "epoch": 0.13739883305100697, "grad_norm": 2.8323168754577637, "learning_rate": 1.9450404667795975e-05, "loss": 4.0082, "step": 730 }, { "epoch": 0.13928100884622624, "grad_norm": 3.0776381492614746, "learning_rate": 1.9442875964615098e-05, "loss": 4.0494, "step": 740 }, { "epoch": 0.1411631846414455, "grad_norm": 3.2516698837280273, "learning_rate": 1.9435347261434217e-05, "loss": 4.0256, "step": 750 }, { "epoch": 0.1430453604366648, "grad_norm": 5.407994747161865, "learning_rate": 1.942781855825334e-05, "loss": 3.9929, "step": 760 }, { "epoch": 0.14492753623188406, "grad_norm": 3.1175644397735596, "learning_rate": 1.9420289855072467e-05, "loss": 3.999, "step": 770 }, { "epoch": 0.14680971202710333, "grad_norm": 3.38260817527771, "learning_rate": 1.941276115189159e-05, "loss": 3.9711, "step": 780 }, { "epoch": 0.1486918878223226, "grad_norm": 3.698014259338379, "learning_rate": 1.9405232448710712e-05, "loss": 4.0025, "step": 790 }, { "epoch": 0.1505740636175419, "grad_norm": 3.224360466003418, "learning_rate": 1.9397703745529835e-05, "loss": 3.9694, "step": 800 }, { "epoch": 0.15245623941276115, "grad_norm": 3.315228223800659, "learning_rate": 1.9390175042348958e-05, "loss": 3.9376, "step": 810 }, { "epoch": 0.15433841520798042, "grad_norm": 4.7839531898498535, "learning_rate": 1.938264633916808e-05, "loss": 3.9115, "step": 820 }, { "epoch": 0.1562205910031997, "grad_norm": 3.185082197189331, "learning_rate": 1.93751176359872e-05, "loss": 3.93, "step": 830 }, { "epoch": 0.15810276679841898, "grad_norm": 3.3744490146636963, "learning_rate": 1.9367588932806324e-05, "loss": 3.9689, "step": 840 }, { "epoch": 0.15998494259363824, "grad_norm": 3.343932867050171, "learning_rate": 1.9360060229625447e-05, "loss": 3.9531, "step": 850 }, { "epoch": 0.1618671183888575, "grad_norm": 3.245112895965576, "learning_rate": 1.9352531526444573e-05, "loss": 4.0125, "step": 860 }, { "epoch": 0.1637492941840768, "grad_norm": 3.1948068141937256, "learning_rate": 1.9345002823263696e-05, "loss": 3.7966, "step": 870 }, { "epoch": 0.16563146997929606, "grad_norm": 3.6105539798736572, "learning_rate": 1.933747412008282e-05, "loss": 3.8701, "step": 880 }, { "epoch": 0.16751364577451533, "grad_norm": 3.940016031265259, "learning_rate": 1.932994541690194e-05, "loss": 3.8866, "step": 890 }, { "epoch": 0.16939582156973462, "grad_norm": 4.30803918838501, "learning_rate": 1.9322416713721065e-05, "loss": 3.9068, "step": 900 }, { "epoch": 0.1712779973649539, "grad_norm": 3.2661314010620117, "learning_rate": 1.9314888010540187e-05, "loss": 3.8431, "step": 910 }, { "epoch": 0.17316017316017315, "grad_norm": 4.278161525726318, "learning_rate": 1.9307359307359307e-05, "loss": 3.8407, "step": 920 }, { "epoch": 0.17504234895539245, "grad_norm": 3.765481472015381, "learning_rate": 1.929983060417843e-05, "loss": 3.8293, "step": 930 }, { "epoch": 0.1769245247506117, "grad_norm": 4.025678634643555, "learning_rate": 1.9292301900997553e-05, "loss": 3.8125, "step": 940 }, { "epoch": 0.17880670054583098, "grad_norm": 3.7228808403015137, "learning_rate": 1.9284773197816676e-05, "loss": 3.7891, "step": 950 }, { "epoch": 0.18068887634105024, "grad_norm": 3.49621844291687, "learning_rate": 1.9277244494635802e-05, "loss": 3.7859, "step": 960 }, { "epoch": 0.18257105213626953, "grad_norm": 3.645171880722046, "learning_rate": 1.9269715791454925e-05, "loss": 3.7994, "step": 970 }, { "epoch": 0.1844532279314888, "grad_norm": 4.006845951080322, "learning_rate": 1.9262187088274048e-05, "loss": 3.7183, "step": 980 }, { "epoch": 0.18633540372670807, "grad_norm": 3.9820406436920166, "learning_rate": 1.925465838509317e-05, "loss": 3.8167, "step": 990 }, { "epoch": 0.18821757952192736, "grad_norm": 3.6761341094970703, "learning_rate": 1.9247129681912294e-05, "loss": 3.7398, "step": 1000 }, { "epoch": 0.19009975531714662, "grad_norm": 4.603309154510498, "learning_rate": 1.9239600978731413e-05, "loss": 3.7817, "step": 1010 }, { "epoch": 0.1919819311123659, "grad_norm": 3.3203389644622803, "learning_rate": 1.9232072275550536e-05, "loss": 3.7976, "step": 1020 }, { "epoch": 0.19386410690758518, "grad_norm": 3.973386287689209, "learning_rate": 1.922454357236966e-05, "loss": 3.746, "step": 1030 }, { "epoch": 0.19574628270280445, "grad_norm": 5.21986722946167, "learning_rate": 1.9217014869188782e-05, "loss": 3.6609, "step": 1040 }, { "epoch": 0.1976284584980237, "grad_norm": 8.304903030395508, "learning_rate": 1.920948616600791e-05, "loss": 3.693, "step": 1050 }, { "epoch": 0.19951063429324298, "grad_norm": 3.812041759490967, "learning_rate": 1.920195746282703e-05, "loss": 3.748, "step": 1060 }, { "epoch": 0.20139281008846227, "grad_norm": 3.4189064502716064, "learning_rate": 1.9194428759646154e-05, "loss": 3.7635, "step": 1070 }, { "epoch": 0.20327498588368154, "grad_norm": 3.6516013145446777, "learning_rate": 1.9186900056465277e-05, "loss": 3.6519, "step": 1080 }, { "epoch": 0.2051571616789008, "grad_norm": 3.0502750873565674, "learning_rate": 1.9179371353284397e-05, "loss": 3.7235, "step": 1090 }, { "epoch": 0.2070393374741201, "grad_norm": 7.05238151550293, "learning_rate": 1.917184265010352e-05, "loss": 3.7494, "step": 1100 }, { "epoch": 0.20892151326933936, "grad_norm": 5.699125289916992, "learning_rate": 1.9164313946922643e-05, "loss": 3.6823, "step": 1110 }, { "epoch": 0.21080368906455862, "grad_norm": 5.011223793029785, "learning_rate": 1.9156785243741765e-05, "loss": 3.6604, "step": 1120 }, { "epoch": 0.2126858648597779, "grad_norm": 4.8337788581848145, "learning_rate": 1.914925654056089e-05, "loss": 3.5912, "step": 1130 }, { "epoch": 0.21456804065499718, "grad_norm": 3.9982621669769287, "learning_rate": 1.9141727837380015e-05, "loss": 3.636, "step": 1140 }, { "epoch": 0.21645021645021645, "grad_norm": 6.422266006469727, "learning_rate": 1.9134199134199138e-05, "loss": 3.6918, "step": 1150 }, { "epoch": 0.2183323922454357, "grad_norm": 3.5577023029327393, "learning_rate": 1.912667043101826e-05, "loss": 3.645, "step": 1160 }, { "epoch": 0.220214568040655, "grad_norm": 3.5199527740478516, "learning_rate": 1.9119141727837383e-05, "loss": 3.5279, "step": 1170 }, { "epoch": 0.22209674383587427, "grad_norm": 3.93874192237854, "learning_rate": 1.9111613024656503e-05, "loss": 3.591, "step": 1180 }, { "epoch": 0.22397891963109354, "grad_norm": 3.5050225257873535, "learning_rate": 1.9104084321475626e-05, "loss": 3.6323, "step": 1190 }, { "epoch": 0.22586109542631283, "grad_norm": 3.3720874786376953, "learning_rate": 1.909655561829475e-05, "loss": 3.6192, "step": 1200 }, { "epoch": 0.2277432712215321, "grad_norm": 2.833101272583008, "learning_rate": 1.9089026915113872e-05, "loss": 3.544, "step": 1210 }, { "epoch": 0.22962544701675136, "grad_norm": 3.4263570308685303, "learning_rate": 1.9081498211932995e-05, "loss": 3.606, "step": 1220 }, { "epoch": 0.23150762281197063, "grad_norm": 4.502038955688477, "learning_rate": 1.907396950875212e-05, "loss": 3.5418, "step": 1230 }, { "epoch": 0.23338979860718992, "grad_norm": 4.862933158874512, "learning_rate": 1.9066440805571244e-05, "loss": 3.5316, "step": 1240 }, { "epoch": 0.23527197440240918, "grad_norm": 7.199465751647949, "learning_rate": 1.9058912102390367e-05, "loss": 3.6576, "step": 1250 }, { "epoch": 0.23715415019762845, "grad_norm": 8.997977256774902, "learning_rate": 1.905138339920949e-05, "loss": 3.3866, "step": 1260 }, { "epoch": 0.23903632599284774, "grad_norm": 3.7024660110473633, "learning_rate": 1.904385469602861e-05, "loss": 3.633, "step": 1270 }, { "epoch": 0.240918501788067, "grad_norm": 4.383315563201904, "learning_rate": 1.9036325992847732e-05, "loss": 3.5204, "step": 1280 }, { "epoch": 0.24280067758328627, "grad_norm": 3.845693588256836, "learning_rate": 1.9028797289666855e-05, "loss": 3.4319, "step": 1290 }, { "epoch": 0.24468285337850557, "grad_norm": 5.626176357269287, "learning_rate": 1.9021268586485978e-05, "loss": 3.6167, "step": 1300 }, { "epoch": 0.24656502917372483, "grad_norm": 3.270461320877075, "learning_rate": 1.90137398833051e-05, "loss": 3.5568, "step": 1310 }, { "epoch": 0.2484472049689441, "grad_norm": 6.196926116943359, "learning_rate": 1.9006211180124224e-05, "loss": 3.5659, "step": 1320 }, { "epoch": 0.2503293807641634, "grad_norm": 3.670217514038086, "learning_rate": 1.899868247694335e-05, "loss": 3.5009, "step": 1330 }, { "epoch": 0.25221155655938265, "grad_norm": 3.7540853023529053, "learning_rate": 1.8991153773762473e-05, "loss": 3.4929, "step": 1340 }, { "epoch": 0.2540937323546019, "grad_norm": 5.680825710296631, "learning_rate": 1.8983625070581596e-05, "loss": 3.4018, "step": 1350 }, { "epoch": 0.2559759081498212, "grad_norm": 5.842689514160156, "learning_rate": 1.8976096367400716e-05, "loss": 3.4934, "step": 1360 }, { "epoch": 0.25785808394504045, "grad_norm": 3.461512804031372, "learning_rate": 1.896856766421984e-05, "loss": 3.526, "step": 1370 }, { "epoch": 0.2597402597402597, "grad_norm": 4.65621280670166, "learning_rate": 1.896103896103896e-05, "loss": 3.4837, "step": 1380 }, { "epoch": 0.26162243553547904, "grad_norm": 3.209920644760132, "learning_rate": 1.8953510257858084e-05, "loss": 3.475, "step": 1390 }, { "epoch": 0.2635046113306983, "grad_norm": 4.2360615730285645, "learning_rate": 1.8945981554677207e-05, "loss": 3.3463, "step": 1400 }, { "epoch": 0.26538678712591757, "grad_norm": 3.977550506591797, "learning_rate": 1.893845285149633e-05, "loss": 3.4359, "step": 1410 }, { "epoch": 0.26726896292113683, "grad_norm": 3.533844470977783, "learning_rate": 1.8930924148315456e-05, "loss": 3.3868, "step": 1420 }, { "epoch": 0.2691511387163561, "grad_norm": 4.788166522979736, "learning_rate": 1.892339544513458e-05, "loss": 3.3756, "step": 1430 }, { "epoch": 0.27103331451157536, "grad_norm": 4.711440086364746, "learning_rate": 1.89158667419537e-05, "loss": 3.5177, "step": 1440 }, { "epoch": 0.27291549030679463, "grad_norm": 3.7958436012268066, "learning_rate": 1.8908338038772822e-05, "loss": 3.3974, "step": 1450 }, { "epoch": 0.27479766610201395, "grad_norm": 5.754987716674805, "learning_rate": 1.8900809335591945e-05, "loss": 3.3473, "step": 1460 }, { "epoch": 0.2766798418972332, "grad_norm": 6.3976311683654785, "learning_rate": 1.8893280632411068e-05, "loss": 3.4912, "step": 1470 }, { "epoch": 0.2785620176924525, "grad_norm": 6.161326885223389, "learning_rate": 1.888575192923019e-05, "loss": 3.3726, "step": 1480 }, { "epoch": 0.28044419348767174, "grad_norm": 4.482143878936768, "learning_rate": 1.8878223226049314e-05, "loss": 3.3857, "step": 1490 }, { "epoch": 0.282326369282891, "grad_norm": 6.511773586273193, "learning_rate": 1.8870694522868436e-05, "loss": 3.3012, "step": 1500 }, { "epoch": 0.2842085450781103, "grad_norm": 3.7000627517700195, "learning_rate": 1.8863165819687563e-05, "loss": 3.4734, "step": 1510 }, { "epoch": 0.2860907208733296, "grad_norm": 3.221539258956909, "learning_rate": 1.8855637116506686e-05, "loss": 3.3935, "step": 1520 }, { "epoch": 0.28797289666854886, "grad_norm": 4.660919189453125, "learning_rate": 1.8848108413325805e-05, "loss": 3.4218, "step": 1530 }, { "epoch": 0.2898550724637681, "grad_norm": 3.7305450439453125, "learning_rate": 1.8840579710144928e-05, "loss": 3.3499, "step": 1540 }, { "epoch": 0.2917372482589874, "grad_norm": 4.458515644073486, "learning_rate": 1.883305100696405e-05, "loss": 3.243, "step": 1550 }, { "epoch": 0.29361942405420666, "grad_norm": 4.746747970581055, "learning_rate": 1.8825522303783174e-05, "loss": 3.4016, "step": 1560 }, { "epoch": 0.2955015998494259, "grad_norm": 4.314861297607422, "learning_rate": 1.8817993600602297e-05, "loss": 3.3974, "step": 1570 }, { "epoch": 0.2973837756446452, "grad_norm": 3.778343677520752, "learning_rate": 1.881046489742142e-05, "loss": 3.3342, "step": 1580 }, { "epoch": 0.2992659514398645, "grad_norm": 3.7303268909454346, "learning_rate": 1.8802936194240543e-05, "loss": 3.3313, "step": 1590 }, { "epoch": 0.3011481272350838, "grad_norm": 3.9831275939941406, "learning_rate": 1.8795407491059666e-05, "loss": 3.214, "step": 1600 }, { "epoch": 0.30303030303030304, "grad_norm": 4.6280975341796875, "learning_rate": 1.8787878787878792e-05, "loss": 3.2715, "step": 1610 }, { "epoch": 0.3049124788255223, "grad_norm": 5.433199405670166, "learning_rate": 1.878035008469791e-05, "loss": 3.4141, "step": 1620 }, { "epoch": 0.30679465462074157, "grad_norm": 3.2537083625793457, "learning_rate": 1.8772821381517034e-05, "loss": 3.2985, "step": 1630 }, { "epoch": 0.30867683041596083, "grad_norm": 6.475714683532715, "learning_rate": 1.8765292678336157e-05, "loss": 3.3312, "step": 1640 }, { "epoch": 0.3105590062111801, "grad_norm": 3.690768003463745, "learning_rate": 1.875776397515528e-05, "loss": 3.4284, "step": 1650 }, { "epoch": 0.3124411820063994, "grad_norm": 4.372195243835449, "learning_rate": 1.8750235271974403e-05, "loss": 3.2672, "step": 1660 }, { "epoch": 0.3143233578016187, "grad_norm": 3.0955049991607666, "learning_rate": 1.8742706568793526e-05, "loss": 3.2831, "step": 1670 }, { "epoch": 0.31620553359683795, "grad_norm": 5.1684112548828125, "learning_rate": 1.873517786561265e-05, "loss": 3.2791, "step": 1680 }, { "epoch": 0.3180877093920572, "grad_norm": 3.559154510498047, "learning_rate": 1.8727649162431772e-05, "loss": 3.2338, "step": 1690 }, { "epoch": 0.3199698851872765, "grad_norm": 6.47905158996582, "learning_rate": 1.8720120459250895e-05, "loss": 3.2949, "step": 1700 }, { "epoch": 0.32185206098249575, "grad_norm": 4.15554141998291, "learning_rate": 1.8712591756070018e-05, "loss": 3.2074, "step": 1710 }, { "epoch": 0.323734236777715, "grad_norm": 5.843398094177246, "learning_rate": 1.870506305288914e-05, "loss": 3.1397, "step": 1720 }, { "epoch": 0.32561641257293433, "grad_norm": 4.45064640045166, "learning_rate": 1.8697534349708264e-05, "loss": 3.2493, "step": 1730 }, { "epoch": 0.3274985883681536, "grad_norm": 3.1222758293151855, "learning_rate": 1.8690005646527387e-05, "loss": 3.2675, "step": 1740 }, { "epoch": 0.32938076416337286, "grad_norm": 4.332676410675049, "learning_rate": 1.868247694334651e-05, "loss": 3.2176, "step": 1750 }, { "epoch": 0.33126293995859213, "grad_norm": 6.784658908843994, "learning_rate": 1.8674948240165632e-05, "loss": 3.194, "step": 1760 }, { "epoch": 0.3331451157538114, "grad_norm": 3.7953319549560547, "learning_rate": 1.8667419536984755e-05, "loss": 3.2735, "step": 1770 }, { "epoch": 0.33502729154903066, "grad_norm": 3.5565590858459473, "learning_rate": 1.8659890833803878e-05, "loss": 3.2094, "step": 1780 }, { "epoch": 0.33690946734425, "grad_norm": 3.9369053840637207, "learning_rate": 1.8652362130623e-05, "loss": 3.0813, "step": 1790 }, { "epoch": 0.33879164313946925, "grad_norm": 7.771551609039307, "learning_rate": 1.8644833427442124e-05, "loss": 3.1139, "step": 1800 }, { "epoch": 0.3406738189346885, "grad_norm": 4.863523960113525, "learning_rate": 1.8637304724261247e-05, "loss": 3.2591, "step": 1810 }, { "epoch": 0.3425559947299078, "grad_norm": 7.337231636047363, "learning_rate": 1.862977602108037e-05, "loss": 3.1937, "step": 1820 }, { "epoch": 0.34443817052512704, "grad_norm": 4.414863109588623, "learning_rate": 1.8622247317899493e-05, "loss": 3.1138, "step": 1830 }, { "epoch": 0.3463203463203463, "grad_norm": 5.118075370788574, "learning_rate": 1.8614718614718616e-05, "loss": 3.1822, "step": 1840 }, { "epoch": 0.34820252211556557, "grad_norm": 5.352917194366455, "learning_rate": 1.860718991153774e-05, "loss": 3.1788, "step": 1850 }, { "epoch": 0.3500846979107849, "grad_norm": 8.42982006072998, "learning_rate": 1.859966120835686e-05, "loss": 3.1031, "step": 1860 }, { "epoch": 0.35196687370600416, "grad_norm": 7.573580741882324, "learning_rate": 1.8592132505175985e-05, "loss": 3.1666, "step": 1870 }, { "epoch": 0.3538490495012234, "grad_norm": 3.9582431316375732, "learning_rate": 1.8584603801995107e-05, "loss": 3.1233, "step": 1880 }, { "epoch": 0.3557312252964427, "grad_norm": 3.29245924949646, "learning_rate": 1.857707509881423e-05, "loss": 3.1, "step": 1890 }, { "epoch": 0.35761340109166195, "grad_norm": 4.551001071929932, "learning_rate": 1.8569546395633353e-05, "loss": 3.1246, "step": 1900 }, { "epoch": 0.3594955768868812, "grad_norm": 3.6506519317626953, "learning_rate": 1.8562017692452476e-05, "loss": 2.9412, "step": 1910 }, { "epoch": 0.3613777526821005, "grad_norm": 4.867851257324219, "learning_rate": 1.85544889892716e-05, "loss": 3.1142, "step": 1920 }, { "epoch": 0.3632599284773198, "grad_norm": 6.288889408111572, "learning_rate": 1.8546960286090722e-05, "loss": 3.1336, "step": 1930 }, { "epoch": 0.36514210427253907, "grad_norm": 4.735827922821045, "learning_rate": 1.8539431582909845e-05, "loss": 2.9844, "step": 1940 }, { "epoch": 0.36702428006775834, "grad_norm": 9.145768165588379, "learning_rate": 1.8531902879728968e-05, "loss": 3.1812, "step": 1950 }, { "epoch": 0.3689064558629776, "grad_norm": 4.640198230743408, "learning_rate": 1.852437417654809e-05, "loss": 3.0759, "step": 1960 }, { "epoch": 0.37078863165819687, "grad_norm": 5.206761837005615, "learning_rate": 1.8516845473367214e-05, "loss": 3.1733, "step": 1970 }, { "epoch": 0.37267080745341613, "grad_norm": 8.24525260925293, "learning_rate": 1.8509316770186337e-05, "loss": 2.8758, "step": 1980 }, { "epoch": 0.3745529832486354, "grad_norm": 6.640933990478516, "learning_rate": 1.850178806700546e-05, "loss": 3.156, "step": 1990 }, { "epoch": 0.3764351590438547, "grad_norm": 3.1573383808135986, "learning_rate": 1.8494259363824583e-05, "loss": 3.0404, "step": 2000 }, { "epoch": 0.378317334839074, "grad_norm": 5.134864807128906, "learning_rate": 1.8486730660643705e-05, "loss": 2.9695, "step": 2010 }, { "epoch": 0.38019951063429325, "grad_norm": 7.871883392333984, "learning_rate": 1.847920195746283e-05, "loss": 3.1814, "step": 2020 }, { "epoch": 0.3820816864295125, "grad_norm": 12.194758415222168, "learning_rate": 1.847167325428195e-05, "loss": 3.0276, "step": 2030 }, { "epoch": 0.3839638622247318, "grad_norm": 6.0320820808410645, "learning_rate": 1.8464144551101074e-05, "loss": 2.9643, "step": 2040 }, { "epoch": 0.38584603801995104, "grad_norm": 5.726856231689453, "learning_rate": 1.8456615847920197e-05, "loss": 2.9393, "step": 2050 }, { "epoch": 0.38772821381517036, "grad_norm": 3.55429744720459, "learning_rate": 1.844908714473932e-05, "loss": 3.1167, "step": 2060 }, { "epoch": 0.38961038961038963, "grad_norm": 7.710501194000244, "learning_rate": 1.8441558441558443e-05, "loss": 3.0125, "step": 2070 }, { "epoch": 0.3914925654056089, "grad_norm": 6.08244514465332, "learning_rate": 1.8434029738377566e-05, "loss": 3.0516, "step": 2080 }, { "epoch": 0.39337474120082816, "grad_norm": 5.4757585525512695, "learning_rate": 1.842650103519669e-05, "loss": 2.8831, "step": 2090 }, { "epoch": 0.3952569169960474, "grad_norm": 5.588348865509033, "learning_rate": 1.8418972332015812e-05, "loss": 2.963, "step": 2100 }, { "epoch": 0.3971390927912667, "grad_norm": 8.077258110046387, "learning_rate": 1.8411443628834935e-05, "loss": 3.085, "step": 2110 }, { "epoch": 0.39902126858648596, "grad_norm": 3.8269598484039307, "learning_rate": 1.8403914925654058e-05, "loss": 2.9913, "step": 2120 }, { "epoch": 0.4009034443817053, "grad_norm": 3.7287516593933105, "learning_rate": 1.839638622247318e-05, "loss": 3.035, "step": 2130 }, { "epoch": 0.40278562017692454, "grad_norm": 5.027886390686035, "learning_rate": 1.8388857519292303e-05, "loss": 2.9356, "step": 2140 }, { "epoch": 0.4046677959721438, "grad_norm": 5.572052478790283, "learning_rate": 1.8381328816111426e-05, "loss": 2.8601, "step": 2150 }, { "epoch": 0.40654997176736307, "grad_norm": 6.4008355140686035, "learning_rate": 1.837380011293055e-05, "loss": 3.0857, "step": 2160 }, { "epoch": 0.40843214756258234, "grad_norm": 4.075533866882324, "learning_rate": 1.8366271409749672e-05, "loss": 2.9145, "step": 2170 }, { "epoch": 0.4103143233578016, "grad_norm": 7.462157249450684, "learning_rate": 1.8358742706568795e-05, "loss": 2.9953, "step": 2180 }, { "epoch": 0.41219649915302087, "grad_norm": 4.182966232299805, "learning_rate": 1.8351214003387918e-05, "loss": 3.0412, "step": 2190 }, { "epoch": 0.4140786749482402, "grad_norm": 10.89699649810791, "learning_rate": 1.834368530020704e-05, "loss": 2.9925, "step": 2200 }, { "epoch": 0.41596085074345945, "grad_norm": 5.336112022399902, "learning_rate": 1.8336156597026164e-05, "loss": 2.9516, "step": 2210 }, { "epoch": 0.4178430265386787, "grad_norm": 5.08997917175293, "learning_rate": 1.8328627893845287e-05, "loss": 2.8662, "step": 2220 }, { "epoch": 0.419725202333898, "grad_norm": 8.328145027160645, "learning_rate": 1.832109919066441e-05, "loss": 2.844, "step": 2230 }, { "epoch": 0.42160737812911725, "grad_norm": 4.348377227783203, "learning_rate": 1.8313570487483533e-05, "loss": 2.9414, "step": 2240 }, { "epoch": 0.4234895539243365, "grad_norm": 5.964900493621826, "learning_rate": 1.8306041784302656e-05, "loss": 2.9333, "step": 2250 }, { "epoch": 0.4253717297195558, "grad_norm": 8.116161346435547, "learning_rate": 1.829851308112178e-05, "loss": 2.8507, "step": 2260 }, { "epoch": 0.4272539055147751, "grad_norm": 5.363483428955078, "learning_rate": 1.82909843779409e-05, "loss": 2.7429, "step": 2270 }, { "epoch": 0.42913608130999437, "grad_norm": 3.6393518447875977, "learning_rate": 1.8283455674760024e-05, "loss": 2.9543, "step": 2280 }, { "epoch": 0.43101825710521363, "grad_norm": 11.873992919921875, "learning_rate": 1.8275926971579147e-05, "loss": 2.9146, "step": 2290 }, { "epoch": 0.4329004329004329, "grad_norm": 6.46327018737793, "learning_rate": 1.826839826839827e-05, "loss": 2.8802, "step": 2300 }, { "epoch": 0.43478260869565216, "grad_norm": 4.694522857666016, "learning_rate": 1.8260869565217393e-05, "loss": 2.7743, "step": 2310 }, { "epoch": 0.4366647844908714, "grad_norm": 5.260149002075195, "learning_rate": 1.8253340862036516e-05, "loss": 2.7807, "step": 2320 }, { "epoch": 0.43854696028609075, "grad_norm": 5.388479709625244, "learning_rate": 1.824581215885564e-05, "loss": 2.7548, "step": 2330 }, { "epoch": 0.44042913608131, "grad_norm": 4.184902191162109, "learning_rate": 1.8238283455674762e-05, "loss": 2.8732, "step": 2340 }, { "epoch": 0.4423113118765293, "grad_norm": 6.423655986785889, "learning_rate": 1.8230754752493885e-05, "loss": 2.7849, "step": 2350 }, { "epoch": 0.44419348767174854, "grad_norm": 8.165117263793945, "learning_rate": 1.8223226049313008e-05, "loss": 2.7968, "step": 2360 }, { "epoch": 0.4460756634669678, "grad_norm": 4.645898342132568, "learning_rate": 1.821569734613213e-05, "loss": 2.9012, "step": 2370 }, { "epoch": 0.4479578392621871, "grad_norm": 5.929998397827148, "learning_rate": 1.8208168642951254e-05, "loss": 2.7395, "step": 2380 }, { "epoch": 0.44984001505740634, "grad_norm": 3.953486442565918, "learning_rate": 1.8200639939770376e-05, "loss": 2.8225, "step": 2390 }, { "epoch": 0.45172219085262566, "grad_norm": 6.943857192993164, "learning_rate": 1.81931112365895e-05, "loss": 2.9133, "step": 2400 }, { "epoch": 0.4536043666478449, "grad_norm": 6.260727405548096, "learning_rate": 1.8185582533408622e-05, "loss": 2.8366, "step": 2410 }, { "epoch": 0.4554865424430642, "grad_norm": 9.32252025604248, "learning_rate": 1.8178053830227745e-05, "loss": 2.7906, "step": 2420 }, { "epoch": 0.45736871823828346, "grad_norm": 4.764776706695557, "learning_rate": 1.8170525127046868e-05, "loss": 2.7746, "step": 2430 }, { "epoch": 0.4592508940335027, "grad_norm": 5.310938358306885, "learning_rate": 1.816299642386599e-05, "loss": 2.8342, "step": 2440 }, { "epoch": 0.461133069828722, "grad_norm": 4.960974216461182, "learning_rate": 1.8155467720685114e-05, "loss": 2.8173, "step": 2450 }, { "epoch": 0.46301524562394125, "grad_norm": 10.526418685913086, "learning_rate": 1.8147939017504237e-05, "loss": 2.718, "step": 2460 }, { "epoch": 0.4648974214191606, "grad_norm": 3.5313024520874023, "learning_rate": 1.814041031432336e-05, "loss": 2.6789, "step": 2470 }, { "epoch": 0.46677959721437984, "grad_norm": 5.258660793304443, "learning_rate": 1.8132881611142483e-05, "loss": 2.7351, "step": 2480 }, { "epoch": 0.4686617730095991, "grad_norm": 3.7664971351623535, "learning_rate": 1.8125352907961606e-05, "loss": 2.7529, "step": 2490 }, { "epoch": 0.47054394880481837, "grad_norm": 9.83526611328125, "learning_rate": 1.811782420478073e-05, "loss": 2.7291, "step": 2500 }, { "epoch": 0.47242612460003763, "grad_norm": 5.410508155822754, "learning_rate": 1.811029550159985e-05, "loss": 2.5376, "step": 2510 }, { "epoch": 0.4743083003952569, "grad_norm": 6.245204925537109, "learning_rate": 1.8102766798418974e-05, "loss": 2.7001, "step": 2520 }, { "epoch": 0.47619047619047616, "grad_norm": 10.058247566223145, "learning_rate": 1.8095238095238097e-05, "loss": 2.5141, "step": 2530 }, { "epoch": 0.4780726519856955, "grad_norm": 3.719804286956787, "learning_rate": 1.808770939205722e-05, "loss": 2.6479, "step": 2540 }, { "epoch": 0.47995482778091475, "grad_norm": 6.557003498077393, "learning_rate": 1.8080180688876343e-05, "loss": 2.6833, "step": 2550 }, { "epoch": 0.481837003576134, "grad_norm": 4.292045593261719, "learning_rate": 1.8072651985695466e-05, "loss": 2.6688, "step": 2560 }, { "epoch": 0.4837191793713533, "grad_norm": 5.613296031951904, "learning_rate": 1.8065123282514586e-05, "loss": 2.7906, "step": 2570 }, { "epoch": 0.48560135516657255, "grad_norm": 10.547001838684082, "learning_rate": 1.8057594579333712e-05, "loss": 2.5276, "step": 2580 }, { "epoch": 0.4874835309617918, "grad_norm": 4.5769782066345215, "learning_rate": 1.8050065876152835e-05, "loss": 2.6816, "step": 2590 }, { "epoch": 0.48936570675701113, "grad_norm": 6.050182342529297, "learning_rate": 1.8042537172971958e-05, "loss": 2.6342, "step": 2600 }, { "epoch": 0.4912478825522304, "grad_norm": 3.172720432281494, "learning_rate": 1.803500846979108e-05, "loss": 2.7868, "step": 2610 }, { "epoch": 0.49313005834744966, "grad_norm": 7.896243095397949, "learning_rate": 1.8027479766610204e-05, "loss": 2.6741, "step": 2620 }, { "epoch": 0.4950122341426689, "grad_norm": 6.55963659286499, "learning_rate": 1.8019951063429327e-05, "loss": 2.6136, "step": 2630 }, { "epoch": 0.4968944099378882, "grad_norm": 4.171892166137695, "learning_rate": 1.801242236024845e-05, "loss": 2.5573, "step": 2640 }, { "epoch": 0.49877658573310746, "grad_norm": 8.071471214294434, "learning_rate": 1.8004893657067572e-05, "loss": 2.7103, "step": 2650 }, { "epoch": 0.5006587615283268, "grad_norm": 5.946764945983887, "learning_rate": 1.7997364953886692e-05, "loss": 2.6454, "step": 2660 }, { "epoch": 0.502540937323546, "grad_norm": 7.082939624786377, "learning_rate": 1.7989836250705818e-05, "loss": 2.6345, "step": 2670 }, { "epoch": 0.5044231131187653, "grad_norm": 5.81397008895874, "learning_rate": 1.798230754752494e-05, "loss": 2.6742, "step": 2680 }, { "epoch": 0.5063052889139845, "grad_norm": 11.386290550231934, "learning_rate": 1.7974778844344064e-05, "loss": 2.6589, "step": 2690 }, { "epoch": 0.5081874647092038, "grad_norm": 8.060523986816406, "learning_rate": 1.7967250141163187e-05, "loss": 2.4929, "step": 2700 }, { "epoch": 0.5100696405044232, "grad_norm": 4.004101753234863, "learning_rate": 1.795972143798231e-05, "loss": 2.6427, "step": 2710 }, { "epoch": 0.5119518162996424, "grad_norm": 4.872875213623047, "learning_rate": 1.7952192734801433e-05, "loss": 2.6538, "step": 2720 }, { "epoch": 0.5138339920948617, "grad_norm": 5.859134197235107, "learning_rate": 1.7944664031620556e-05, "loss": 2.736, "step": 2730 }, { "epoch": 0.5157161678900809, "grad_norm": 12.146455764770508, "learning_rate": 1.793713532843968e-05, "loss": 2.5728, "step": 2740 }, { "epoch": 0.5175983436853002, "grad_norm": 7.305758476257324, "learning_rate": 1.7929606625258798e-05, "loss": 2.5778, "step": 2750 }, { "epoch": 0.5194805194805194, "grad_norm": 3.7814443111419678, "learning_rate": 1.792207792207792e-05, "loss": 2.5947, "step": 2760 }, { "epoch": 0.5213626952757388, "grad_norm": 3.1260640621185303, "learning_rate": 1.7914549218897047e-05, "loss": 2.6592, "step": 2770 }, { "epoch": 0.5232448710709581, "grad_norm": 5.482910633087158, "learning_rate": 1.790702051571617e-05, "loss": 2.546, "step": 2780 }, { "epoch": 0.5251270468661773, "grad_norm": 4.761525630950928, "learning_rate": 1.7899491812535293e-05, "loss": 2.5239, "step": 2790 }, { "epoch": 0.5270092226613966, "grad_norm": 10.941164016723633, "learning_rate": 1.7891963109354416e-05, "loss": 2.5527, "step": 2800 }, { "epoch": 0.5288913984566158, "grad_norm": 8.885186195373535, "learning_rate": 1.788443440617354e-05, "loss": 2.5902, "step": 2810 }, { "epoch": 0.5307735742518351, "grad_norm": 6.169194221496582, "learning_rate": 1.7876905702992662e-05, "loss": 2.6758, "step": 2820 }, { "epoch": 0.5326557500470543, "grad_norm": 11.396913528442383, "learning_rate": 1.7869376999811785e-05, "loss": 2.6403, "step": 2830 }, { "epoch": 0.5345379258422737, "grad_norm": 6.287395477294922, "learning_rate": 1.7861848296630905e-05, "loss": 2.5786, "step": 2840 }, { "epoch": 0.536420101637493, "grad_norm": 4.667734146118164, "learning_rate": 1.7854319593450027e-05, "loss": 2.4285, "step": 2850 }, { "epoch": 0.5383022774327122, "grad_norm": 7.646928787231445, "learning_rate": 1.7846790890269154e-05, "loss": 2.3599, "step": 2860 }, { "epoch": 0.5401844532279315, "grad_norm": 7.949150085449219, "learning_rate": 1.7839262187088277e-05, "loss": 2.4367, "step": 2870 }, { "epoch": 0.5420666290231507, "grad_norm": 23.867076873779297, "learning_rate": 1.78317334839074e-05, "loss": 2.3823, "step": 2880 }, { "epoch": 0.54394880481837, "grad_norm": 5.78860330581665, "learning_rate": 1.7824204780726523e-05, "loss": 2.5005, "step": 2890 }, { "epoch": 0.5458309806135893, "grad_norm": 10.566971778869629, "learning_rate": 1.7816676077545645e-05, "loss": 2.4836, "step": 2900 }, { "epoch": 0.5477131564088086, "grad_norm": 4.010104656219482, "learning_rate": 1.780914737436477e-05, "loss": 2.3889, "step": 2910 }, { "epoch": 0.5495953322040279, "grad_norm": 10.083136558532715, "learning_rate": 1.7801618671183888e-05, "loss": 2.4056, "step": 2920 }, { "epoch": 0.5514775079992471, "grad_norm": 4.860154151916504, "learning_rate": 1.779408996800301e-05, "loss": 2.5066, "step": 2930 }, { "epoch": 0.5533596837944664, "grad_norm": 6.0354905128479, "learning_rate": 1.7786561264822134e-05, "loss": 2.5239, "step": 2940 }, { "epoch": 0.5552418595896856, "grad_norm": 5.251504898071289, "learning_rate": 1.777903256164126e-05, "loss": 2.4828, "step": 2950 }, { "epoch": 0.557124035384905, "grad_norm": 10.071287155151367, "learning_rate": 1.7771503858460383e-05, "loss": 2.6557, "step": 2960 }, { "epoch": 0.5590062111801242, "grad_norm": 22.186641693115234, "learning_rate": 1.7763975155279506e-05, "loss": 2.4674, "step": 2970 }, { "epoch": 0.5608883869753435, "grad_norm": 7.2353291511535645, "learning_rate": 1.775644645209863e-05, "loss": 2.4926, "step": 2980 }, { "epoch": 0.5627705627705628, "grad_norm": 7.9539794921875, "learning_rate": 1.7748917748917752e-05, "loss": 2.5513, "step": 2990 }, { "epoch": 0.564652738565782, "grad_norm": 2.906517267227173, "learning_rate": 1.7741389045736875e-05, "loss": 2.5281, "step": 3000 }, { "epoch": 0.5665349143610013, "grad_norm": 6.161061763763428, "learning_rate": 1.7733860342555994e-05, "loss": 2.3144, "step": 3010 }, { "epoch": 0.5684170901562206, "grad_norm": 6.098898410797119, "learning_rate": 1.7726331639375117e-05, "loss": 2.4401, "step": 3020 }, { "epoch": 0.5702992659514399, "grad_norm": 6.252312183380127, "learning_rate": 1.771880293619424e-05, "loss": 2.3486, "step": 3030 }, { "epoch": 0.5721814417466592, "grad_norm": 5.018267631530762, "learning_rate": 1.7711274233013366e-05, "loss": 2.5058, "step": 3040 }, { "epoch": 0.5740636175418784, "grad_norm": 4.869485378265381, "learning_rate": 1.770374552983249e-05, "loss": 2.4141, "step": 3050 }, { "epoch": 0.5759457933370977, "grad_norm": 9.376824378967285, "learning_rate": 1.7696216826651612e-05, "loss": 2.3761, "step": 3060 }, { "epoch": 0.5778279691323169, "grad_norm": 9.775004386901855, "learning_rate": 1.7688688123470735e-05, "loss": 2.6541, "step": 3070 }, { "epoch": 0.5797101449275363, "grad_norm": 5.287901401519775, "learning_rate": 1.7681159420289858e-05, "loss": 2.3716, "step": 3080 }, { "epoch": 0.5815923207227555, "grad_norm": 4.416796684265137, "learning_rate": 1.767363071710898e-05, "loss": 2.2715, "step": 3090 }, { "epoch": 0.5834744965179748, "grad_norm": 7.947756767272949, "learning_rate": 1.76661020139281e-05, "loss": 2.3447, "step": 3100 }, { "epoch": 0.5853566723131941, "grad_norm": 8.54676342010498, "learning_rate": 1.7658573310747223e-05, "loss": 2.4445, "step": 3110 }, { "epoch": 0.5872388481084133, "grad_norm": 9.31558609008789, "learning_rate": 1.7651044607566346e-05, "loss": 2.3065, "step": 3120 }, { "epoch": 0.5891210239036326, "grad_norm": 6.117997646331787, "learning_rate": 1.764351590438547e-05, "loss": 2.3316, "step": 3130 }, { "epoch": 0.5910031996988518, "grad_norm": 6.215322017669678, "learning_rate": 1.7635987201204596e-05, "loss": 2.286, "step": 3140 }, { "epoch": 0.5928853754940712, "grad_norm": 7.536943435668945, "learning_rate": 1.762845849802372e-05, "loss": 2.6575, "step": 3150 }, { "epoch": 0.5947675512892904, "grad_norm": 9.97204875946045, "learning_rate": 1.762092979484284e-05, "loss": 2.2224, "step": 3160 }, { "epoch": 0.5966497270845097, "grad_norm": 11.99947452545166, "learning_rate": 1.7613401091661964e-05, "loss": 2.5265, "step": 3170 }, { "epoch": 0.598531902879729, "grad_norm": 5.427422046661377, "learning_rate": 1.7605872388481084e-05, "loss": 2.2315, "step": 3180 }, { "epoch": 0.6004140786749482, "grad_norm": 4.574037551879883, "learning_rate": 1.7598343685300207e-05, "loss": 2.372, "step": 3190 }, { "epoch": 0.6022962544701675, "grad_norm": 7.115758419036865, "learning_rate": 1.759081498211933e-05, "loss": 2.5758, "step": 3200 }, { "epoch": 0.6041784302653868, "grad_norm": 8.453811645507812, "learning_rate": 1.7583286278938453e-05, "loss": 2.1461, "step": 3210 }, { "epoch": 0.6060606060606061, "grad_norm": 8.490642547607422, "learning_rate": 1.7575757575757576e-05, "loss": 2.3582, "step": 3220 }, { "epoch": 0.6079427818558253, "grad_norm": 19.479263305664062, "learning_rate": 1.7568228872576702e-05, "loss": 2.2744, "step": 3230 }, { "epoch": 0.6098249576510446, "grad_norm": 14.469964981079102, "learning_rate": 1.7560700169395825e-05, "loss": 2.4447, "step": 3240 }, { "epoch": 0.6117071334462639, "grad_norm": 3.8202433586120605, "learning_rate": 1.7553171466214948e-05, "loss": 2.367, "step": 3250 }, { "epoch": 0.6135893092414831, "grad_norm": 5.609041213989258, "learning_rate": 1.754564276303407e-05, "loss": 2.4837, "step": 3260 }, { "epoch": 0.6154714850367025, "grad_norm": 4.065234661102295, "learning_rate": 1.753811405985319e-05, "loss": 2.3212, "step": 3270 }, { "epoch": 0.6173536608319217, "grad_norm": 8.558253288269043, "learning_rate": 1.7530585356672313e-05, "loss": 2.2335, "step": 3280 }, { "epoch": 0.619235836627141, "grad_norm": 6.554466247558594, "learning_rate": 1.7523056653491436e-05, "loss": 2.1559, "step": 3290 }, { "epoch": 0.6211180124223602, "grad_norm": 6.196263790130615, "learning_rate": 1.751552795031056e-05, "loss": 2.3493, "step": 3300 }, { "epoch": 0.6230001882175795, "grad_norm": 14.270978927612305, "learning_rate": 1.7507999247129682e-05, "loss": 2.2956, "step": 3310 }, { "epoch": 0.6248823640127988, "grad_norm": 22.012874603271484, "learning_rate": 1.7500470543948808e-05, "loss": 2.3193, "step": 3320 }, { "epoch": 0.626764539808018, "grad_norm": 6.940454959869385, "learning_rate": 1.749294184076793e-05, "loss": 2.2292, "step": 3330 }, { "epoch": 0.6286467156032374, "grad_norm": 5.154090881347656, "learning_rate": 1.7485413137587054e-05, "loss": 2.3149, "step": 3340 }, { "epoch": 0.6305288913984566, "grad_norm": 4.400321006774902, "learning_rate": 1.7477884434406177e-05, "loss": 2.4335, "step": 3350 }, { "epoch": 0.6324110671936759, "grad_norm": 8.82219123840332, "learning_rate": 1.7470355731225296e-05, "loss": 2.1911, "step": 3360 }, { "epoch": 0.6342932429888951, "grad_norm": 7.568221569061279, "learning_rate": 1.746282702804442e-05, "loss": 2.3591, "step": 3370 }, { "epoch": 0.6361754187841144, "grad_norm": 13.639214515686035, "learning_rate": 1.7455298324863542e-05, "loss": 2.3107, "step": 3380 }, { "epoch": 0.6380575945793338, "grad_norm": 4.316806793212891, "learning_rate": 1.7447769621682665e-05, "loss": 2.1789, "step": 3390 }, { "epoch": 0.639939770374553, "grad_norm": 4.804785251617432, "learning_rate": 1.7440240918501788e-05, "loss": 2.1812, "step": 3400 }, { "epoch": 0.6418219461697723, "grad_norm": 8.034902572631836, "learning_rate": 1.743271221532091e-05, "loss": 2.2914, "step": 3410 }, { "epoch": 0.6437041219649915, "grad_norm": 19.507585525512695, "learning_rate": 1.7425183512140037e-05, "loss": 2.3718, "step": 3420 }, { "epoch": 0.6455862977602108, "grad_norm": 13.059826850891113, "learning_rate": 1.741765480895916e-05, "loss": 2.2965, "step": 3430 }, { "epoch": 0.64746847355543, "grad_norm": 5.1344499588012695, "learning_rate": 1.7410126105778283e-05, "loss": 2.2073, "step": 3440 }, { "epoch": 0.6493506493506493, "grad_norm": 7.513482570648193, "learning_rate": 1.7402597402597403e-05, "loss": 2.4255, "step": 3450 }, { "epoch": 0.6512328251458687, "grad_norm": 4.743190288543701, "learning_rate": 1.7395068699416526e-05, "loss": 2.1949, "step": 3460 }, { "epoch": 0.6531150009410879, "grad_norm": 7.261030197143555, "learning_rate": 1.738753999623565e-05, "loss": 2.4901, "step": 3470 }, { "epoch": 0.6549971767363072, "grad_norm": 5.6540093421936035, "learning_rate": 1.738001129305477e-05, "loss": 2.3915, "step": 3480 }, { "epoch": 0.6568793525315264, "grad_norm": 6.778534889221191, "learning_rate": 1.7372482589873894e-05, "loss": 2.2246, "step": 3490 }, { "epoch": 0.6587615283267457, "grad_norm": 7.759307384490967, "learning_rate": 1.7364953886693017e-05, "loss": 2.0732, "step": 3500 }, { "epoch": 0.6606437041219649, "grad_norm": 16.40176010131836, "learning_rate": 1.7357425183512144e-05, "loss": 2.2766, "step": 3510 }, { "epoch": 0.6625258799171843, "grad_norm": 7.301122188568115, "learning_rate": 1.7349896480331267e-05, "loss": 2.1197, "step": 3520 }, { "epoch": 0.6644080557124036, "grad_norm": 5.543787479400635, "learning_rate": 1.7342367777150386e-05, "loss": 2.2689, "step": 3530 }, { "epoch": 0.6662902315076228, "grad_norm": 4.671676158905029, "learning_rate": 1.733483907396951e-05, "loss": 2.2419, "step": 3540 }, { "epoch": 0.6681724073028421, "grad_norm": 6.969106197357178, "learning_rate": 1.7327310370788632e-05, "loss": 2.127, "step": 3550 }, { "epoch": 0.6700545830980613, "grad_norm": 7.427960395812988, "learning_rate": 1.7319781667607755e-05, "loss": 2.3973, "step": 3560 }, { "epoch": 0.6719367588932806, "grad_norm": 4.417695999145508, "learning_rate": 1.7312252964426878e-05, "loss": 2.0037, "step": 3570 }, { "epoch": 0.6738189346885, "grad_norm": 6.873748779296875, "learning_rate": 1.7304724261246e-05, "loss": 2.0635, "step": 3580 }, { "epoch": 0.6757011104837192, "grad_norm": 4.036855220794678, "learning_rate": 1.7297195558065124e-05, "loss": 2.2912, "step": 3590 }, { "epoch": 0.6775832862789385, "grad_norm": 6.932463645935059, "learning_rate": 1.728966685488425e-05, "loss": 2.1217, "step": 3600 }, { "epoch": 0.6794654620741577, "grad_norm": 15.238147735595703, "learning_rate": 1.7282138151703373e-05, "loss": 2.2328, "step": 3610 }, { "epoch": 0.681347637869377, "grad_norm": 4.133694648742676, "learning_rate": 1.7274609448522492e-05, "loss": 2.0326, "step": 3620 }, { "epoch": 0.6832298136645962, "grad_norm": 5.9729766845703125, "learning_rate": 1.7267080745341615e-05, "loss": 2.2155, "step": 3630 }, { "epoch": 0.6851119894598156, "grad_norm": 13.098869323730469, "learning_rate": 1.7259552042160738e-05, "loss": 2.267, "step": 3640 }, { "epoch": 0.6869941652550349, "grad_norm": 3.980104684829712, "learning_rate": 1.725202333897986e-05, "loss": 2.0865, "step": 3650 }, { "epoch": 0.6888763410502541, "grad_norm": 3.8054277896881104, "learning_rate": 1.7244494635798984e-05, "loss": 2.079, "step": 3660 }, { "epoch": 0.6907585168454734, "grad_norm": 6.999955654144287, "learning_rate": 1.7236965932618107e-05, "loss": 2.1672, "step": 3670 }, { "epoch": 0.6926406926406926, "grad_norm": 11.031990051269531, "learning_rate": 1.722943722943723e-05, "loss": 2.1624, "step": 3680 }, { "epoch": 0.6945228684359119, "grad_norm": 8.7633056640625, "learning_rate": 1.7221908526256356e-05, "loss": 2.227, "step": 3690 }, { "epoch": 0.6964050442311311, "grad_norm": 7.400768280029297, "learning_rate": 1.721437982307548e-05, "loss": 2.1331, "step": 3700 }, { "epoch": 0.6982872200263505, "grad_norm": 4.83717155456543, "learning_rate": 1.72068511198946e-05, "loss": 2.049, "step": 3710 }, { "epoch": 0.7001693958215698, "grad_norm": 9.554513931274414, "learning_rate": 1.719932241671372e-05, "loss": 2.1901, "step": 3720 }, { "epoch": 0.702051571616789, "grad_norm": 6.765610694885254, "learning_rate": 1.7191793713532845e-05, "loss": 2.0239, "step": 3730 }, { "epoch": 0.7039337474120083, "grad_norm": 9.109612464904785, "learning_rate": 1.7184265010351967e-05, "loss": 2.3195, "step": 3740 }, { "epoch": 0.7058159232072275, "grad_norm": 14.373083114624023, "learning_rate": 1.717673630717109e-05, "loss": 2.2486, "step": 3750 }, { "epoch": 0.7076980990024468, "grad_norm": 8.647749900817871, "learning_rate": 1.7169207603990213e-05, "loss": 2.0991, "step": 3760 }, { "epoch": 0.709580274797666, "grad_norm": 7.751832008361816, "learning_rate": 1.7161678900809336e-05, "loss": 2.0374, "step": 3770 }, { "epoch": 0.7114624505928854, "grad_norm": 21.32139015197754, "learning_rate": 1.715415019762846e-05, "loss": 2.0808, "step": 3780 }, { "epoch": 0.7133446263881047, "grad_norm": 9.653078079223633, "learning_rate": 1.7146621494447582e-05, "loss": 2.0081, "step": 3790 }, { "epoch": 0.7152268021833239, "grad_norm": 2.955333948135376, "learning_rate": 1.7139092791266705e-05, "loss": 2.2522, "step": 3800 }, { "epoch": 0.7171089779785432, "grad_norm": 19.1168212890625, "learning_rate": 1.7131564088085828e-05, "loss": 2.2064, "step": 3810 }, { "epoch": 0.7189911537737624, "grad_norm": 9.711030960083008, "learning_rate": 1.712403538490495e-05, "loss": 1.9549, "step": 3820 }, { "epoch": 0.7208733295689818, "grad_norm": 15.813447952270508, "learning_rate": 1.7116506681724074e-05, "loss": 2.1709, "step": 3830 }, { "epoch": 0.722755505364201, "grad_norm": 11.953786849975586, "learning_rate": 1.7108977978543197e-05, "loss": 1.9825, "step": 3840 }, { "epoch": 0.7246376811594203, "grad_norm": 6.5752034187316895, "learning_rate": 1.710144927536232e-05, "loss": 2.1801, "step": 3850 }, { "epoch": 0.7265198569546396, "grad_norm": 10.277215957641602, "learning_rate": 1.7093920572181443e-05, "loss": 2.0362, "step": 3860 }, { "epoch": 0.7284020327498588, "grad_norm": 12.632518768310547, "learning_rate": 1.7086391869000565e-05, "loss": 2.1342, "step": 3870 }, { "epoch": 0.7302842085450781, "grad_norm": 8.751096725463867, "learning_rate": 1.707886316581969e-05, "loss": 1.9903, "step": 3880 }, { "epoch": 0.7321663843402973, "grad_norm": 6.152645587921143, "learning_rate": 1.707133446263881e-05, "loss": 2.1692, "step": 3890 }, { "epoch": 0.7340485601355167, "grad_norm": 5.939425945281982, "learning_rate": 1.7063805759457934e-05, "loss": 1.8733, "step": 3900 }, { "epoch": 0.7359307359307359, "grad_norm": 13.272156715393066, "learning_rate": 1.7056277056277057e-05, "loss": 2.1915, "step": 3910 }, { "epoch": 0.7378129117259552, "grad_norm": 4.635878086090088, "learning_rate": 1.704874835309618e-05, "loss": 2.0686, "step": 3920 }, { "epoch": 0.7396950875211745, "grad_norm": 15.71611499786377, "learning_rate": 1.7041219649915303e-05, "loss": 1.9865, "step": 3930 }, { "epoch": 0.7415772633163937, "grad_norm": 6.550897121429443, "learning_rate": 1.7033690946734426e-05, "loss": 1.9559, "step": 3940 }, { "epoch": 0.743459439111613, "grad_norm": 8.063326835632324, "learning_rate": 1.702616224355355e-05, "loss": 2.2405, "step": 3950 }, { "epoch": 0.7453416149068323, "grad_norm": 8.72902774810791, "learning_rate": 1.7018633540372672e-05, "loss": 1.9724, "step": 3960 }, { "epoch": 0.7472237907020516, "grad_norm": 10.127591133117676, "learning_rate": 1.7011104837191795e-05, "loss": 2.0511, "step": 3970 }, { "epoch": 0.7491059664972708, "grad_norm": 16.748136520385742, "learning_rate": 1.7003576134010918e-05, "loss": 2.0714, "step": 3980 }, { "epoch": 0.7509881422924901, "grad_norm": 11.505964279174805, "learning_rate": 1.699604743083004e-05, "loss": 2.0682, "step": 3990 }, { "epoch": 0.7528703180877094, "grad_norm": 12.179784774780273, "learning_rate": 1.6988518727649163e-05, "loss": 2.0715, "step": 4000 }, { "epoch": 0.7547524938829286, "grad_norm": 5.324981212615967, "learning_rate": 1.6980990024468286e-05, "loss": 2.0796, "step": 4010 }, { "epoch": 0.756634669678148, "grad_norm": 6.018923759460449, "learning_rate": 1.697346132128741e-05, "loss": 1.9147, "step": 4020 }, { "epoch": 0.7585168454733672, "grad_norm": 5.128754138946533, "learning_rate": 1.6965932618106532e-05, "loss": 2.2234, "step": 4030 }, { "epoch": 0.7603990212685865, "grad_norm": 7.023144721984863, "learning_rate": 1.6958403914925655e-05, "loss": 1.9784, "step": 4040 }, { "epoch": 0.7622811970638057, "grad_norm": 5.513296127319336, "learning_rate": 1.6950875211744778e-05, "loss": 1.8725, "step": 4050 }, { "epoch": 0.764163372859025, "grad_norm": 5.376245498657227, "learning_rate": 1.69433465085639e-05, "loss": 1.9038, "step": 4060 }, { "epoch": 0.7660455486542443, "grad_norm": 4.453749179840088, "learning_rate": 1.6935817805383024e-05, "loss": 2.0901, "step": 4070 }, { "epoch": 0.7679277244494636, "grad_norm": 6.467559814453125, "learning_rate": 1.6928289102202147e-05, "loss": 1.9841, "step": 4080 }, { "epoch": 0.7698099002446829, "grad_norm": 5.831012725830078, "learning_rate": 1.692076039902127e-05, "loss": 1.7553, "step": 4090 }, { "epoch": 0.7716920760399021, "grad_norm": 8.568511009216309, "learning_rate": 1.6913231695840393e-05, "loss": 2.0252, "step": 4100 }, { "epoch": 0.7735742518351214, "grad_norm": 6.630292892456055, "learning_rate": 1.6905702992659516e-05, "loss": 2.0024, "step": 4110 }, { "epoch": 0.7754564276303407, "grad_norm": 5.808889389038086, "learning_rate": 1.689817428947864e-05, "loss": 1.7828, "step": 4120 }, { "epoch": 0.7773386034255599, "grad_norm": 4.167357444763184, "learning_rate": 1.689064558629776e-05, "loss": 1.7878, "step": 4130 }, { "epoch": 0.7792207792207793, "grad_norm": 16.51450538635254, "learning_rate": 1.6883116883116884e-05, "loss": 2.0933, "step": 4140 }, { "epoch": 0.7811029550159985, "grad_norm": 7.76092004776001, "learning_rate": 1.6875588179936007e-05, "loss": 1.8682, "step": 4150 }, { "epoch": 0.7829851308112178, "grad_norm": 3.58233904838562, "learning_rate": 1.686805947675513e-05, "loss": 2.0892, "step": 4160 }, { "epoch": 0.784867306606437, "grad_norm": 4.204033851623535, "learning_rate": 1.6860530773574253e-05, "loss": 2.132, "step": 4170 }, { "epoch": 0.7867494824016563, "grad_norm": 4.688413619995117, "learning_rate": 1.6853002070393376e-05, "loss": 1.8572, "step": 4180 }, { "epoch": 0.7886316581968756, "grad_norm": 10.84228515625, "learning_rate": 1.68454733672125e-05, "loss": 2.0358, "step": 4190 }, { "epoch": 0.7905138339920948, "grad_norm": 6.188892841339111, "learning_rate": 1.6837944664031622e-05, "loss": 1.8994, "step": 4200 }, { "epoch": 0.7923960097873142, "grad_norm": 14.771183013916016, "learning_rate": 1.6830415960850745e-05, "loss": 2.006, "step": 4210 }, { "epoch": 0.7942781855825334, "grad_norm": 6.336653709411621, "learning_rate": 1.6822887257669868e-05, "loss": 1.9038, "step": 4220 }, { "epoch": 0.7961603613777527, "grad_norm": 7.478043079376221, "learning_rate": 1.681535855448899e-05, "loss": 1.9509, "step": 4230 }, { "epoch": 0.7980425371729719, "grad_norm": 12.106429100036621, "learning_rate": 1.6807829851308114e-05, "loss": 1.8751, "step": 4240 }, { "epoch": 0.7999247129681912, "grad_norm": 8.335380554199219, "learning_rate": 1.6800301148127236e-05, "loss": 2.1869, "step": 4250 }, { "epoch": 0.8018068887634106, "grad_norm": 9.204015731811523, "learning_rate": 1.679277244494636e-05, "loss": 2.2222, "step": 4260 }, { "epoch": 0.8036890645586298, "grad_norm": 5.910819053649902, "learning_rate": 1.6785243741765482e-05, "loss": 1.7396, "step": 4270 }, { "epoch": 0.8055712403538491, "grad_norm": 12.734546661376953, "learning_rate": 1.6777715038584605e-05, "loss": 2.0184, "step": 4280 }, { "epoch": 0.8074534161490683, "grad_norm": 8.117308616638184, "learning_rate": 1.6770186335403728e-05, "loss": 2.1315, "step": 4290 }, { "epoch": 0.8093355919442876, "grad_norm": 6.764369010925293, "learning_rate": 1.676265763222285e-05, "loss": 1.7082, "step": 4300 }, { "epoch": 0.8112177677395068, "grad_norm": 11.162191390991211, "learning_rate": 1.6755128929041974e-05, "loss": 1.8846, "step": 4310 }, { "epoch": 0.8130999435347261, "grad_norm": 7.554187297821045, "learning_rate": 1.6747600225861097e-05, "loss": 1.7372, "step": 4320 }, { "epoch": 0.8149821193299455, "grad_norm": 6.000895977020264, "learning_rate": 1.674007152268022e-05, "loss": 2.1344, "step": 4330 }, { "epoch": 0.8168642951251647, "grad_norm": 6.909875869750977, "learning_rate": 1.6732542819499343e-05, "loss": 1.7821, "step": 4340 }, { "epoch": 0.818746470920384, "grad_norm": 9.617115020751953, "learning_rate": 1.6725014116318466e-05, "loss": 1.8464, "step": 4350 }, { "epoch": 0.8206286467156032, "grad_norm": 3.998657703399658, "learning_rate": 1.671748541313759e-05, "loss": 1.6707, "step": 4360 }, { "epoch": 0.8225108225108225, "grad_norm": 18.605613708496094, "learning_rate": 1.670995670995671e-05, "loss": 1.9964, "step": 4370 }, { "epoch": 0.8243929983060417, "grad_norm": 21.7368221282959, "learning_rate": 1.6702428006775834e-05, "loss": 1.7039, "step": 4380 }, { "epoch": 0.8262751741012611, "grad_norm": 10.246116638183594, "learning_rate": 1.6694899303594957e-05, "loss": 1.8365, "step": 4390 }, { "epoch": 0.8281573498964804, "grad_norm": 12.197159767150879, "learning_rate": 1.668737060041408e-05, "loss": 2.0829, "step": 4400 }, { "epoch": 0.8300395256916996, "grad_norm": 7.530276298522949, "learning_rate": 1.6679841897233203e-05, "loss": 1.8986, "step": 4410 }, { "epoch": 0.8319217014869189, "grad_norm": 9.89567756652832, "learning_rate": 1.6672313194052326e-05, "loss": 1.9985, "step": 4420 }, { "epoch": 0.8338038772821381, "grad_norm": 10.721818923950195, "learning_rate": 1.666478449087145e-05, "loss": 1.8921, "step": 4430 }, { "epoch": 0.8356860530773574, "grad_norm": 12.71999740600586, "learning_rate": 1.6657255787690572e-05, "loss": 1.8283, "step": 4440 }, { "epoch": 0.8375682288725766, "grad_norm": 8.337474822998047, "learning_rate": 1.6649727084509695e-05, "loss": 1.892, "step": 4450 }, { "epoch": 0.839450404667796, "grad_norm": 12.982160568237305, "learning_rate": 1.6642198381328818e-05, "loss": 1.9852, "step": 4460 }, { "epoch": 0.8413325804630153, "grad_norm": 7.9721784591674805, "learning_rate": 1.663466967814794e-05, "loss": 2.096, "step": 4470 }, { "epoch": 0.8432147562582345, "grad_norm": 13.718586921691895, "learning_rate": 1.6627140974967064e-05, "loss": 1.9495, "step": 4480 }, { "epoch": 0.8450969320534538, "grad_norm": 6.098572731018066, "learning_rate": 1.6619612271786187e-05, "loss": 1.5641, "step": 4490 }, { "epoch": 0.846979107848673, "grad_norm": 5.264159202575684, "learning_rate": 1.661208356860531e-05, "loss": 1.8201, "step": 4500 }, { "epoch": 0.8488612836438924, "grad_norm": 15.892148971557617, "learning_rate": 1.6604554865424432e-05, "loss": 1.8025, "step": 4510 }, { "epoch": 0.8507434594391116, "grad_norm": 19.255720138549805, "learning_rate": 1.6597026162243555e-05, "loss": 1.8699, "step": 4520 }, { "epoch": 0.8526256352343309, "grad_norm": 12.752776145935059, "learning_rate": 1.6589497459062678e-05, "loss": 1.9059, "step": 4530 }, { "epoch": 0.8545078110295502, "grad_norm": 24.916339874267578, "learning_rate": 1.65819687558818e-05, "loss": 1.8327, "step": 4540 }, { "epoch": 0.8563899868247694, "grad_norm": 16.435821533203125, "learning_rate": 1.6574440052700924e-05, "loss": 1.907, "step": 4550 }, { "epoch": 0.8582721626199887, "grad_norm": 8.208995819091797, "learning_rate": 1.6566911349520047e-05, "loss": 1.7911, "step": 4560 }, { "epoch": 0.8601543384152079, "grad_norm": 19.37531089782715, "learning_rate": 1.655938264633917e-05, "loss": 2.0121, "step": 4570 }, { "epoch": 0.8620365142104273, "grad_norm": 9.35920524597168, "learning_rate": 1.6551853943158293e-05, "loss": 1.8435, "step": 4580 }, { "epoch": 0.8639186900056465, "grad_norm": 19.906599044799805, "learning_rate": 1.6544325239977416e-05, "loss": 2.0695, "step": 4590 }, { "epoch": 0.8658008658008658, "grad_norm": 6.58824348449707, "learning_rate": 1.653679653679654e-05, "loss": 1.8859, "step": 4600 }, { "epoch": 0.8676830415960851, "grad_norm": 7.258162975311279, "learning_rate": 1.652926783361566e-05, "loss": 1.8014, "step": 4610 }, { "epoch": 0.8695652173913043, "grad_norm": 10.79643726348877, "learning_rate": 1.6521739130434785e-05, "loss": 1.9955, "step": 4620 }, { "epoch": 0.8714473931865236, "grad_norm": 8.076590538024902, "learning_rate": 1.6514210427253907e-05, "loss": 1.9565, "step": 4630 }, { "epoch": 0.8733295689817429, "grad_norm": 34.28034591674805, "learning_rate": 1.650668172407303e-05, "loss": 1.712, "step": 4640 }, { "epoch": 0.8752117447769622, "grad_norm": 13.998528480529785, "learning_rate": 1.6499153020892153e-05, "loss": 1.9953, "step": 4650 }, { "epoch": 0.8770939205721815, "grad_norm": 11.216374397277832, "learning_rate": 1.6491624317711273e-05, "loss": 1.7193, "step": 4660 }, { "epoch": 0.8789760963674007, "grad_norm": 18.6248722076416, "learning_rate": 1.64840956145304e-05, "loss": 1.6458, "step": 4670 }, { "epoch": 0.88085827216262, "grad_norm": 4.484902858734131, "learning_rate": 1.6476566911349522e-05, "loss": 1.8185, "step": 4680 }, { "epoch": 0.8827404479578392, "grad_norm": 20.6184024810791, "learning_rate": 1.6469038208168645e-05, "loss": 1.8833, "step": 4690 }, { "epoch": 0.8846226237530586, "grad_norm": 10.890295028686523, "learning_rate": 1.6461509504987768e-05, "loss": 1.7697, "step": 4700 }, { "epoch": 0.8865047995482778, "grad_norm": 10.369776725769043, "learning_rate": 1.645398080180689e-05, "loss": 1.8408, "step": 4710 }, { "epoch": 0.8883869753434971, "grad_norm": 6.13337516784668, "learning_rate": 1.6446452098626014e-05, "loss": 1.9334, "step": 4720 }, { "epoch": 0.8902691511387164, "grad_norm": 7.776824951171875, "learning_rate": 1.6438923395445137e-05, "loss": 1.6361, "step": 4730 }, { "epoch": 0.8921513269339356, "grad_norm": 16.185203552246094, "learning_rate": 1.643139469226426e-05, "loss": 1.6398, "step": 4740 }, { "epoch": 0.8940335027291549, "grad_norm": 16.69852638244629, "learning_rate": 1.642386598908338e-05, "loss": 1.8242, "step": 4750 }, { "epoch": 0.8959156785243741, "grad_norm": 8.41193675994873, "learning_rate": 1.6416337285902505e-05, "loss": 1.6251, "step": 4760 }, { "epoch": 0.8977978543195935, "grad_norm": 7.705557823181152, "learning_rate": 1.640880858272163e-05, "loss": 1.9138, "step": 4770 }, { "epoch": 0.8996800301148127, "grad_norm": 6.7078537940979, "learning_rate": 1.640127987954075e-05, "loss": 1.8573, "step": 4780 }, { "epoch": 0.901562205910032, "grad_norm": 9.067506790161133, "learning_rate": 1.6393751176359874e-05, "loss": 1.7235, "step": 4790 }, { "epoch": 0.9034443817052513, "grad_norm": 6.557106971740723, "learning_rate": 1.6386222473178997e-05, "loss": 1.5589, "step": 4800 }, { "epoch": 0.9053265575004705, "grad_norm": 18.50225067138672, "learning_rate": 1.637869376999812e-05, "loss": 1.7907, "step": 4810 }, { "epoch": 0.9072087332956899, "grad_norm": 13.424736022949219, "learning_rate": 1.6371165066817243e-05, "loss": 1.8507, "step": 4820 }, { "epoch": 0.9090909090909091, "grad_norm": 6.312253952026367, "learning_rate": 1.6363636363636366e-05, "loss": 1.8001, "step": 4830 }, { "epoch": 0.9109730848861284, "grad_norm": 4.222917556762695, "learning_rate": 1.6356107660455485e-05, "loss": 1.9496, "step": 4840 }, { "epoch": 0.9128552606813476, "grad_norm": 14.68265438079834, "learning_rate": 1.6348578957274612e-05, "loss": 1.7783, "step": 4850 }, { "epoch": 0.9147374364765669, "grad_norm": 7.720460891723633, "learning_rate": 1.6341050254093735e-05, "loss": 1.6416, "step": 4860 }, { "epoch": 0.9166196122717862, "grad_norm": 7.769947052001953, "learning_rate": 1.6333521550912858e-05, "loss": 1.8749, "step": 4870 }, { "epoch": 0.9185017880670054, "grad_norm": 9.666947364807129, "learning_rate": 1.632599284773198e-05, "loss": 1.6327, "step": 4880 }, { "epoch": 0.9203839638622248, "grad_norm": 12.226252555847168, "learning_rate": 1.6318464144551103e-05, "loss": 1.7772, "step": 4890 }, { "epoch": 0.922266139657444, "grad_norm": 21.034391403198242, "learning_rate": 1.6310935441370226e-05, "loss": 1.8364, "step": 4900 }, { "epoch": 0.9241483154526633, "grad_norm": 11.93802261352539, "learning_rate": 1.630340673818935e-05, "loss": 1.8696, "step": 4910 }, { "epoch": 0.9260304912478825, "grad_norm": 16.049116134643555, "learning_rate": 1.6295878035008472e-05, "loss": 1.8541, "step": 4920 }, { "epoch": 0.9279126670431018, "grad_norm": 12.634326934814453, "learning_rate": 1.6288349331827592e-05, "loss": 1.6714, "step": 4930 }, { "epoch": 0.9297948428383211, "grad_norm": 17.020570755004883, "learning_rate": 1.6280820628646715e-05, "loss": 1.915, "step": 4940 }, { "epoch": 0.9316770186335404, "grad_norm": 3.1234688758850098, "learning_rate": 1.627329192546584e-05, "loss": 1.4168, "step": 4950 }, { "epoch": 0.9335591944287597, "grad_norm": 14.488174438476562, "learning_rate": 1.6265763222284964e-05, "loss": 1.4905, "step": 4960 }, { "epoch": 0.9354413702239789, "grad_norm": 9.10696029663086, "learning_rate": 1.6258234519104087e-05, "loss": 1.7308, "step": 4970 }, { "epoch": 0.9373235460191982, "grad_norm": 10.455379486083984, "learning_rate": 1.625070581592321e-05, "loss": 1.6388, "step": 4980 }, { "epoch": 0.9392057218144174, "grad_norm": 12.714646339416504, "learning_rate": 1.6243177112742333e-05, "loss": 1.5879, "step": 4990 }, { "epoch": 0.9410878976096367, "grad_norm": 33.10146713256836, "learning_rate": 1.6235648409561456e-05, "loss": 1.4738, "step": 5000 }, { "epoch": 0.9429700734048561, "grad_norm": 5.845310688018799, "learning_rate": 1.6228119706380575e-05, "loss": 1.6773, "step": 5010 }, { "epoch": 0.9448522492000753, "grad_norm": 8.02888298034668, "learning_rate": 1.6220591003199698e-05, "loss": 1.7755, "step": 5020 }, { "epoch": 0.9467344249952946, "grad_norm": 17.18178367614746, "learning_rate": 1.621306230001882e-05, "loss": 1.669, "step": 5030 }, { "epoch": 0.9486166007905138, "grad_norm": 8.009747505187988, "learning_rate": 1.6205533596837947e-05, "loss": 1.6732, "step": 5040 }, { "epoch": 0.9504987765857331, "grad_norm": 7.900938034057617, "learning_rate": 1.619800489365707e-05, "loss": 1.6969, "step": 5050 }, { "epoch": 0.9523809523809523, "grad_norm": 12.25010871887207, "learning_rate": 1.6190476190476193e-05, "loss": 1.8035, "step": 5060 }, { "epoch": 0.9542631281761716, "grad_norm": 7.510868072509766, "learning_rate": 1.6182947487295316e-05, "loss": 1.5132, "step": 5070 }, { "epoch": 0.956145303971391, "grad_norm": 6.639998435974121, "learning_rate": 1.617541878411444e-05, "loss": 1.5381, "step": 5080 }, { "epoch": 0.9580274797666102, "grad_norm": 11.637256622314453, "learning_rate": 1.6167890080933562e-05, "loss": 1.704, "step": 5090 }, { "epoch": 0.9599096555618295, "grad_norm": 8.058938026428223, "learning_rate": 1.616036137775268e-05, "loss": 1.5851, "step": 5100 }, { "epoch": 0.9617918313570487, "grad_norm": 9.28378677368164, "learning_rate": 1.6152832674571804e-05, "loss": 1.6364, "step": 5110 }, { "epoch": 0.963674007152268, "grad_norm": 12.845236778259277, "learning_rate": 1.6145303971390927e-05, "loss": 1.845, "step": 5120 }, { "epoch": 0.9655561829474872, "grad_norm": 10.16936206817627, "learning_rate": 1.6137775268210054e-05, "loss": 1.6366, "step": 5130 }, { "epoch": 0.9674383587427066, "grad_norm": 9.189414024353027, "learning_rate": 1.6130246565029176e-05, "loss": 1.5852, "step": 5140 }, { "epoch": 0.9693205345379259, "grad_norm": 8.392788887023926, "learning_rate": 1.61227178618483e-05, "loss": 1.736, "step": 5150 }, { "epoch": 0.9712027103331451, "grad_norm": 7.551772594451904, "learning_rate": 1.6115189158667422e-05, "loss": 1.742, "step": 5160 }, { "epoch": 0.9730848861283644, "grad_norm": 11.65742015838623, "learning_rate": 1.6107660455486545e-05, "loss": 1.7394, "step": 5170 }, { "epoch": 0.9749670619235836, "grad_norm": 5.885232448577881, "learning_rate": 1.6100131752305668e-05, "loss": 1.6527, "step": 5180 }, { "epoch": 0.9768492377188029, "grad_norm": 9.612196922302246, "learning_rate": 1.6092603049124788e-05, "loss": 1.5318, "step": 5190 }, { "epoch": 0.9787314135140223, "grad_norm": 16.069961547851562, "learning_rate": 1.608507434594391e-05, "loss": 1.5113, "step": 5200 }, { "epoch": 0.9806135893092415, "grad_norm": 15.502707481384277, "learning_rate": 1.6077545642763034e-05, "loss": 1.5743, "step": 5210 }, { "epoch": 0.9824957651044608, "grad_norm": 24.306838989257812, "learning_rate": 1.6070016939582156e-05, "loss": 1.6745, "step": 5220 }, { "epoch": 0.98437794089968, "grad_norm": 4.061778545379639, "learning_rate": 1.6062488236401283e-05, "loss": 1.5577, "step": 5230 }, { "epoch": 0.9862601166948993, "grad_norm": 5.753101825714111, "learning_rate": 1.6054959533220406e-05, "loss": 1.3725, "step": 5240 }, { "epoch": 0.9881422924901185, "grad_norm": 9.790191650390625, "learning_rate": 1.604743083003953e-05, "loss": 1.5817, "step": 5250 }, { "epoch": 0.9900244682853379, "grad_norm": 4.075527191162109, "learning_rate": 1.603990212685865e-05, "loss": 1.4994, "step": 5260 }, { "epoch": 0.9919066440805572, "grad_norm": 2.995372772216797, "learning_rate": 1.603237342367777e-05, "loss": 1.5081, "step": 5270 }, { "epoch": 0.9937888198757764, "grad_norm": 2.866976022720337, "learning_rate": 1.6024844720496894e-05, "loss": 1.7521, "step": 5280 }, { "epoch": 0.9956709956709957, "grad_norm": 13.255521774291992, "learning_rate": 1.6017316017316017e-05, "loss": 1.4685, "step": 5290 }, { "epoch": 0.9975531714662149, "grad_norm": 5.982124328613281, "learning_rate": 1.600978731413514e-05, "loss": 1.5513, "step": 5300 }, { "epoch": 0.9994353472614342, "grad_norm": 16.305452346801758, "learning_rate": 1.6002258610954263e-05, "loss": 1.4469, "step": 5310 }, { "epoch": 1.0, "eval_accuracy": 0.8716, "eval_loss": 1.1871148347854614, "eval_runtime": 143.0155, "eval_samples_per_second": 52.442, "eval_steps_per_second": 6.559, "step": 5313 }, { "epoch": 1.0013175230566536, "grad_norm": 9.596165657043457, "learning_rate": 1.599472990777339e-05, "loss": 1.7778, "step": 5320 }, { "epoch": 1.0031996988518728, "grad_norm": 9.728874206542969, "learning_rate": 1.5987201204592512e-05, "loss": 1.6959, "step": 5330 }, { "epoch": 1.005081874647092, "grad_norm": 11.146066665649414, "learning_rate": 1.5979672501411635e-05, "loss": 1.4466, "step": 5340 }, { "epoch": 1.0069640504423114, "grad_norm": 11.417769432067871, "learning_rate": 1.5972143798230758e-05, "loss": 1.5719, "step": 5350 }, { "epoch": 1.0088462262375306, "grad_norm": 11.120503425598145, "learning_rate": 1.5964615095049877e-05, "loss": 1.6229, "step": 5360 }, { "epoch": 1.0107284020327498, "grad_norm": 24.465999603271484, "learning_rate": 1.5957086391869e-05, "loss": 1.6695, "step": 5370 }, { "epoch": 1.012610577827969, "grad_norm": 6.379744529724121, "learning_rate": 1.5949557688688123e-05, "loss": 1.6745, "step": 5380 }, { "epoch": 1.0144927536231885, "grad_norm": 11.811795234680176, "learning_rate": 1.5942028985507246e-05, "loss": 1.3933, "step": 5390 }, { "epoch": 1.0163749294184077, "grad_norm": 6.720681667327881, "learning_rate": 1.593450028232637e-05, "loss": 1.4849, "step": 5400 }, { "epoch": 1.018257105213627, "grad_norm": 5.741754531860352, "learning_rate": 1.5926971579145495e-05, "loss": 1.5725, "step": 5410 }, { "epoch": 1.0201392810088463, "grad_norm": 15.412458419799805, "learning_rate": 1.5919442875964618e-05, "loss": 1.461, "step": 5420 }, { "epoch": 1.0220214568040655, "grad_norm": 19.5515079498291, "learning_rate": 1.591191417278374e-05, "loss": 1.7264, "step": 5430 }, { "epoch": 1.0239036325992847, "grad_norm": 7.74648904800415, "learning_rate": 1.5904385469602864e-05, "loss": 1.4598, "step": 5440 }, { "epoch": 1.025785808394504, "grad_norm": 3.796482801437378, "learning_rate": 1.5896856766421984e-05, "loss": 1.501, "step": 5450 }, { "epoch": 1.0276679841897234, "grad_norm": 10.214385032653809, "learning_rate": 1.5889328063241107e-05, "loss": 1.5436, "step": 5460 }, { "epoch": 1.0295501599849426, "grad_norm": 9.938682556152344, "learning_rate": 1.588179936006023e-05, "loss": 1.4925, "step": 5470 }, { "epoch": 1.0314323357801618, "grad_norm": 9.411529541015625, "learning_rate": 1.5874270656879352e-05, "loss": 1.3498, "step": 5480 }, { "epoch": 1.0333145115753812, "grad_norm": 5.797966003417969, "learning_rate": 1.5866741953698475e-05, "loss": 1.7169, "step": 5490 }, { "epoch": 1.0351966873706004, "grad_norm": 16.239484786987305, "learning_rate": 1.58592132505176e-05, "loss": 1.6468, "step": 5500 }, { "epoch": 1.0370788631658197, "grad_norm": 7.906765460968018, "learning_rate": 1.5851684547336725e-05, "loss": 1.4129, "step": 5510 }, { "epoch": 1.0389610389610389, "grad_norm": 23.242895126342773, "learning_rate": 1.5844155844155847e-05, "loss": 1.4746, "step": 5520 }, { "epoch": 1.0408432147562583, "grad_norm": 11.383037567138672, "learning_rate": 1.583662714097497e-05, "loss": 1.4018, "step": 5530 }, { "epoch": 1.0427253905514775, "grad_norm": 5.408214092254639, "learning_rate": 1.582909843779409e-05, "loss": 1.2102, "step": 5540 }, { "epoch": 1.0446075663466967, "grad_norm": 4.229701519012451, "learning_rate": 1.5821569734613213e-05, "loss": 1.2008, "step": 5550 }, { "epoch": 1.0464897421419161, "grad_norm": 10.835386276245117, "learning_rate": 1.5814041031432336e-05, "loss": 1.4252, "step": 5560 }, { "epoch": 1.0483719179371354, "grad_norm": 7.582424640655518, "learning_rate": 1.580651232825146e-05, "loss": 1.4494, "step": 5570 }, { "epoch": 1.0502540937323546, "grad_norm": 22.497562408447266, "learning_rate": 1.579898362507058e-05, "loss": 1.6387, "step": 5580 }, { "epoch": 1.0521362695275738, "grad_norm": 3.5851354598999023, "learning_rate": 1.5791454921889705e-05, "loss": 1.2493, "step": 5590 }, { "epoch": 1.0540184453227932, "grad_norm": 13.096941947937012, "learning_rate": 1.578392621870883e-05, "loss": 1.5457, "step": 5600 }, { "epoch": 1.0559006211180124, "grad_norm": 4.804878234863281, "learning_rate": 1.5776397515527954e-05, "loss": 1.5518, "step": 5610 }, { "epoch": 1.0577827969132316, "grad_norm": 7.501936912536621, "learning_rate": 1.5768868812347073e-05, "loss": 1.658, "step": 5620 }, { "epoch": 1.059664972708451, "grad_norm": 8.500528335571289, "learning_rate": 1.5761340109166196e-05, "loss": 1.3127, "step": 5630 }, { "epoch": 1.0615471485036703, "grad_norm": 7.3146443367004395, "learning_rate": 1.575381140598532e-05, "loss": 1.257, "step": 5640 }, { "epoch": 1.0634293242988895, "grad_norm": 10.491667747497559, "learning_rate": 1.5746282702804442e-05, "loss": 1.4083, "step": 5650 }, { "epoch": 1.0653115000941087, "grad_norm": 9.215764999389648, "learning_rate": 1.5738753999623565e-05, "loss": 1.3877, "step": 5660 }, { "epoch": 1.0671936758893281, "grad_norm": 8.09468936920166, "learning_rate": 1.5731225296442688e-05, "loss": 1.2621, "step": 5670 }, { "epoch": 1.0690758516845473, "grad_norm": 26.073856353759766, "learning_rate": 1.572369659326181e-05, "loss": 1.5949, "step": 5680 }, { "epoch": 1.0709580274797665, "grad_norm": 5.2231245040893555, "learning_rate": 1.5716167890080937e-05, "loss": 1.2968, "step": 5690 }, { "epoch": 1.072840203274986, "grad_norm": 3.045153856277466, "learning_rate": 1.570863918690006e-05, "loss": 1.2484, "step": 5700 }, { "epoch": 1.0747223790702052, "grad_norm": 2.576266288757324, "learning_rate": 1.570111048371918e-05, "loss": 1.4398, "step": 5710 }, { "epoch": 1.0766045548654244, "grad_norm": 7.949709892272949, "learning_rate": 1.5693581780538303e-05, "loss": 1.3862, "step": 5720 }, { "epoch": 1.0784867306606438, "grad_norm": 16.988584518432617, "learning_rate": 1.5686053077357425e-05, "loss": 1.5145, "step": 5730 }, { "epoch": 1.080368906455863, "grad_norm": 15.880838394165039, "learning_rate": 1.567852437417655e-05, "loss": 1.3245, "step": 5740 }, { "epoch": 1.0822510822510822, "grad_norm": 14.930453300476074, "learning_rate": 1.567099567099567e-05, "loss": 1.7517, "step": 5750 }, { "epoch": 1.0841332580463015, "grad_norm": 9.984511375427246, "learning_rate": 1.5663466967814794e-05, "loss": 1.688, "step": 5760 }, { "epoch": 1.0860154338415209, "grad_norm": 13.405725479125977, "learning_rate": 1.5655938264633917e-05, "loss": 1.4352, "step": 5770 }, { "epoch": 1.08789760963674, "grad_norm": 14.22274112701416, "learning_rate": 1.5648409561453043e-05, "loss": 1.592, "step": 5780 }, { "epoch": 1.0897797854319593, "grad_norm": 15.278095245361328, "learning_rate": 1.5640880858272166e-05, "loss": 1.5002, "step": 5790 }, { "epoch": 1.0916619612271785, "grad_norm": 15.014613151550293, "learning_rate": 1.5633352155091286e-05, "loss": 1.5084, "step": 5800 }, { "epoch": 1.093544137022398, "grad_norm": 10.296102523803711, "learning_rate": 1.562582345191041e-05, "loss": 1.4295, "step": 5810 }, { "epoch": 1.0954263128176172, "grad_norm": 8.37751579284668, "learning_rate": 1.5618294748729532e-05, "loss": 1.3661, "step": 5820 }, { "epoch": 1.0973084886128364, "grad_norm": 5.940329551696777, "learning_rate": 1.5610766045548655e-05, "loss": 1.437, "step": 5830 }, { "epoch": 1.0991906644080558, "grad_norm": 6.347326755523682, "learning_rate": 1.5603237342367778e-05, "loss": 1.2912, "step": 5840 }, { "epoch": 1.101072840203275, "grad_norm": 7.670511245727539, "learning_rate": 1.55957086391869e-05, "loss": 1.2584, "step": 5850 }, { "epoch": 1.1029550159984942, "grad_norm": 14.886139869689941, "learning_rate": 1.5588179936006023e-05, "loss": 1.5002, "step": 5860 }, { "epoch": 1.1048371917937136, "grad_norm": 9.98166275024414, "learning_rate": 1.5580651232825146e-05, "loss": 1.366, "step": 5870 }, { "epoch": 1.1067193675889329, "grad_norm": 4.7431817054748535, "learning_rate": 1.557312252964427e-05, "loss": 1.5411, "step": 5880 }, { "epoch": 1.108601543384152, "grad_norm": 12.848723411560059, "learning_rate": 1.5565593826463392e-05, "loss": 1.3226, "step": 5890 }, { "epoch": 1.1104837191793713, "grad_norm": 6.75590705871582, "learning_rate": 1.5558065123282515e-05, "loss": 1.4568, "step": 5900 }, { "epoch": 1.1123658949745907, "grad_norm": 8.309553146362305, "learning_rate": 1.5550536420101638e-05, "loss": 1.2237, "step": 5910 }, { "epoch": 1.11424807076981, "grad_norm": 46.17892837524414, "learning_rate": 1.554300771692076e-05, "loss": 1.3477, "step": 5920 }, { "epoch": 1.1161302465650291, "grad_norm": 13.070061683654785, "learning_rate": 1.5535479013739884e-05, "loss": 1.3567, "step": 5930 }, { "epoch": 1.1180124223602483, "grad_norm": 19.65549659729004, "learning_rate": 1.5527950310559007e-05, "loss": 1.2769, "step": 5940 }, { "epoch": 1.1198945981554678, "grad_norm": 6.129930019378662, "learning_rate": 1.552042160737813e-05, "loss": 1.349, "step": 5950 }, { "epoch": 1.121776773950687, "grad_norm": 18.13608169555664, "learning_rate": 1.5512892904197253e-05, "loss": 1.6379, "step": 5960 }, { "epoch": 1.1236589497459062, "grad_norm": 8.193785667419434, "learning_rate": 1.5505364201016376e-05, "loss": 1.3498, "step": 5970 }, { "epoch": 1.1255411255411256, "grad_norm": 6.419458866119385, "learning_rate": 1.54978354978355e-05, "loss": 1.188, "step": 5980 }, { "epoch": 1.1274233013363448, "grad_norm": 8.998268127441406, "learning_rate": 1.549030679465462e-05, "loss": 1.42, "step": 5990 }, { "epoch": 1.129305477131564, "grad_norm": 10.662075996398926, "learning_rate": 1.5482778091473744e-05, "loss": 1.0762, "step": 6000 }, { "epoch": 1.1311876529267835, "grad_norm": 12.713705062866211, "learning_rate": 1.5475249388292867e-05, "loss": 1.2355, "step": 6010 }, { "epoch": 1.1330698287220027, "grad_norm": 7.3867268562316895, "learning_rate": 1.546772068511199e-05, "loss": 1.1358, "step": 6020 }, { "epoch": 1.134952004517222, "grad_norm": 5.125856399536133, "learning_rate": 1.5460191981931113e-05, "loss": 1.3024, "step": 6030 }, { "epoch": 1.136834180312441, "grad_norm": 5.838713645935059, "learning_rate": 1.5452663278750236e-05, "loss": 1.2684, "step": 6040 }, { "epoch": 1.1387163561076605, "grad_norm": 9.813702583312988, "learning_rate": 1.544513457556936e-05, "loss": 1.2061, "step": 6050 }, { "epoch": 1.1405985319028797, "grad_norm": 5.236690998077393, "learning_rate": 1.5437605872388482e-05, "loss": 1.2684, "step": 6060 }, { "epoch": 1.142480707698099, "grad_norm": 14.457210540771484, "learning_rate": 1.5430077169207605e-05, "loss": 1.3883, "step": 6070 }, { "epoch": 1.1443628834933182, "grad_norm": 29.406164169311523, "learning_rate": 1.5422548466026728e-05, "loss": 1.4574, "step": 6080 }, { "epoch": 1.1462450592885376, "grad_norm": 17.454238891601562, "learning_rate": 1.541501976284585e-05, "loss": 1.2196, "step": 6090 }, { "epoch": 1.1481272350837568, "grad_norm": 15.656752586364746, "learning_rate": 1.5407491059664974e-05, "loss": 1.204, "step": 6100 }, { "epoch": 1.150009410878976, "grad_norm": 7.013121128082275, "learning_rate": 1.5399962356484096e-05, "loss": 1.4965, "step": 6110 }, { "epoch": 1.1518915866741954, "grad_norm": 2.7544403076171875, "learning_rate": 1.539243365330322e-05, "loss": 1.2982, "step": 6120 }, { "epoch": 1.1537737624694147, "grad_norm": 3.2940828800201416, "learning_rate": 1.5384904950122342e-05, "loss": 1.2389, "step": 6130 }, { "epoch": 1.1556559382646339, "grad_norm": 25.011899948120117, "learning_rate": 1.5377376246941465e-05, "loss": 1.2476, "step": 6140 }, { "epoch": 1.1575381140598533, "grad_norm": 8.474715232849121, "learning_rate": 1.5369847543760588e-05, "loss": 1.6291, "step": 6150 }, { "epoch": 1.1594202898550725, "grad_norm": 18.783309936523438, "learning_rate": 1.536231884057971e-05, "loss": 1.364, "step": 6160 }, { "epoch": 1.1613024656502917, "grad_norm": 10.150487899780273, "learning_rate": 1.5354790137398834e-05, "loss": 1.3958, "step": 6170 }, { "epoch": 1.163184641445511, "grad_norm": 7.357229232788086, "learning_rate": 1.5347261434217957e-05, "loss": 1.3219, "step": 6180 }, { "epoch": 1.1650668172407304, "grad_norm": 29.95680809020996, "learning_rate": 1.533973273103708e-05, "loss": 1.27, "step": 6190 }, { "epoch": 1.1669489930359496, "grad_norm": 14.412184715270996, "learning_rate": 1.5332204027856203e-05, "loss": 1.5809, "step": 6200 }, { "epoch": 1.1688311688311688, "grad_norm": 18.45595932006836, "learning_rate": 1.5324675324675326e-05, "loss": 1.3501, "step": 6210 }, { "epoch": 1.170713344626388, "grad_norm": 6.867365837097168, "learning_rate": 1.531714662149445e-05, "loss": 1.3181, "step": 6220 }, { "epoch": 1.1725955204216074, "grad_norm": 5.471830368041992, "learning_rate": 1.530961791831357e-05, "loss": 1.1032, "step": 6230 }, { "epoch": 1.1744776962168266, "grad_norm": 9.03870677947998, "learning_rate": 1.5302089215132694e-05, "loss": 1.1249, "step": 6240 }, { "epoch": 1.1763598720120458, "grad_norm": 10.276759147644043, "learning_rate": 1.5294560511951817e-05, "loss": 1.4236, "step": 6250 }, { "epoch": 1.1782420478072653, "grad_norm": 5.121780872344971, "learning_rate": 1.528703180877094e-05, "loss": 1.3727, "step": 6260 }, { "epoch": 1.1801242236024845, "grad_norm": 4.659458637237549, "learning_rate": 1.5279503105590063e-05, "loss": 1.2893, "step": 6270 }, { "epoch": 1.1820063993977037, "grad_norm": 33.238712310791016, "learning_rate": 1.5271974402409186e-05, "loss": 1.2643, "step": 6280 }, { "epoch": 1.1838885751929231, "grad_norm": 13.528379440307617, "learning_rate": 1.526444569922831e-05, "loss": 1.4603, "step": 6290 }, { "epoch": 1.1857707509881423, "grad_norm": 5.262114524841309, "learning_rate": 1.5256916996047434e-05, "loss": 1.0799, "step": 6300 }, { "epoch": 1.1876529267833615, "grad_norm": 7.602123260498047, "learning_rate": 1.5249388292866557e-05, "loss": 1.3151, "step": 6310 }, { "epoch": 1.1895351025785807, "grad_norm": 6.674724578857422, "learning_rate": 1.5241859589685676e-05, "loss": 1.166, "step": 6320 }, { "epoch": 1.1914172783738002, "grad_norm": 8.049056053161621, "learning_rate": 1.52343308865048e-05, "loss": 1.217, "step": 6330 }, { "epoch": 1.1932994541690194, "grad_norm": 12.107414245605469, "learning_rate": 1.5226802183323924e-05, "loss": 1.1538, "step": 6340 }, { "epoch": 1.1951816299642386, "grad_norm": 11.166797637939453, "learning_rate": 1.5219273480143047e-05, "loss": 1.3976, "step": 6350 }, { "epoch": 1.1970638057594578, "grad_norm": 6.7904953956604, "learning_rate": 1.521174477696217e-05, "loss": 1.5883, "step": 6360 }, { "epoch": 1.1989459815546772, "grad_norm": 9.514391899108887, "learning_rate": 1.5204216073781292e-05, "loss": 1.472, "step": 6370 }, { "epoch": 1.2008281573498965, "grad_norm": 29.685026168823242, "learning_rate": 1.5196687370600415e-05, "loss": 1.5798, "step": 6380 }, { "epoch": 1.2027103331451157, "grad_norm": 14.561470985412598, "learning_rate": 1.5189158667419538e-05, "loss": 1.4515, "step": 6390 }, { "epoch": 1.204592508940335, "grad_norm": 19.98004150390625, "learning_rate": 1.5181629964238663e-05, "loss": 1.3567, "step": 6400 }, { "epoch": 1.2064746847355543, "grad_norm": 10.195460319519043, "learning_rate": 1.5174101261057782e-05, "loss": 1.2189, "step": 6410 }, { "epoch": 1.2083568605307735, "grad_norm": 2.810354471206665, "learning_rate": 1.5166572557876907e-05, "loss": 1.4201, "step": 6420 }, { "epoch": 1.210239036325993, "grad_norm": 11.01855182647705, "learning_rate": 1.515904385469603e-05, "loss": 1.1615, "step": 6430 }, { "epoch": 1.2121212121212122, "grad_norm": 14.129963874816895, "learning_rate": 1.5151515151515153e-05, "loss": 1.2612, "step": 6440 }, { "epoch": 1.2140033879164314, "grad_norm": 11.1683349609375, "learning_rate": 1.5143986448334276e-05, "loss": 1.3002, "step": 6450 }, { "epoch": 1.2158855637116506, "grad_norm": 5.463107585906982, "learning_rate": 1.5136457745153399e-05, "loss": 1.2832, "step": 6460 }, { "epoch": 1.21776773950687, "grad_norm": 11.25309944152832, "learning_rate": 1.5128929041972522e-05, "loss": 1.2142, "step": 6470 }, { "epoch": 1.2196499153020892, "grad_norm": 8.699621200561523, "learning_rate": 1.5121400338791645e-05, "loss": 1.0461, "step": 6480 }, { "epoch": 1.2215320910973084, "grad_norm": 16.165090560913086, "learning_rate": 1.5113871635610766e-05, "loss": 1.2469, "step": 6490 }, { "epoch": 1.2234142668925279, "grad_norm": 25.876081466674805, "learning_rate": 1.5106342932429889e-05, "loss": 1.4, "step": 6500 }, { "epoch": 1.225296442687747, "grad_norm": 10.362266540527344, "learning_rate": 1.5098814229249013e-05, "loss": 1.0884, "step": 6510 }, { "epoch": 1.2271786184829663, "grad_norm": 4.939754486083984, "learning_rate": 1.5091285526068136e-05, "loss": 1.5001, "step": 6520 }, { "epoch": 1.2290607942781855, "grad_norm": 8.243157386779785, "learning_rate": 1.5083756822887259e-05, "loss": 1.3486, "step": 6530 }, { "epoch": 1.230942970073405, "grad_norm": 5.043724060058594, "learning_rate": 1.5076228119706382e-05, "loss": 1.1551, "step": 6540 }, { "epoch": 1.2328251458686241, "grad_norm": 5.159477233886719, "learning_rate": 1.5068699416525505e-05, "loss": 1.0274, "step": 6550 }, { "epoch": 1.2347073216638433, "grad_norm": 5.960069179534912, "learning_rate": 1.5061170713344628e-05, "loss": 1.232, "step": 6560 }, { "epoch": 1.2365894974590628, "grad_norm": 5.939128875732422, "learning_rate": 1.505364201016375e-05, "loss": 1.6197, "step": 6570 }, { "epoch": 1.238471673254282, "grad_norm": 13.313936233520508, "learning_rate": 1.5046113306982872e-05, "loss": 1.1685, "step": 6580 }, { "epoch": 1.2403538490495012, "grad_norm": 4.995264053344727, "learning_rate": 1.5038584603801995e-05, "loss": 1.2126, "step": 6590 }, { "epoch": 1.2422360248447206, "grad_norm": 19.458560943603516, "learning_rate": 1.5031055900621118e-05, "loss": 0.9409, "step": 6600 }, { "epoch": 1.2441182006399398, "grad_norm": 11.867140769958496, "learning_rate": 1.5023527197440243e-05, "loss": 1.1042, "step": 6610 }, { "epoch": 1.246000376435159, "grad_norm": 7.560145854949951, "learning_rate": 1.5015998494259365e-05, "loss": 1.399, "step": 6620 }, { "epoch": 1.2478825522303783, "grad_norm": 19.83662986755371, "learning_rate": 1.5008469791078488e-05, "loss": 1.2062, "step": 6630 }, { "epoch": 1.2497647280255977, "grad_norm": 11.589738845825195, "learning_rate": 1.5000941087897611e-05, "loss": 1.2568, "step": 6640 }, { "epoch": 1.251646903820817, "grad_norm": 14.702427864074707, "learning_rate": 1.4993412384716734e-05, "loss": 1.1121, "step": 6650 }, { "epoch": 1.253529079616036, "grad_norm": 10.332629203796387, "learning_rate": 1.4985883681535857e-05, "loss": 1.0371, "step": 6660 }, { "epoch": 1.2554112554112553, "grad_norm": 11.859658241271973, "learning_rate": 1.4978354978354978e-05, "loss": 1.3456, "step": 6670 }, { "epoch": 1.2572934312064747, "grad_norm": 14.336831092834473, "learning_rate": 1.4970826275174101e-05, "loss": 1.5135, "step": 6680 }, { "epoch": 1.259175607001694, "grad_norm": 21.42241096496582, "learning_rate": 1.4963297571993224e-05, "loss": 1.1505, "step": 6690 }, { "epoch": 1.2610577827969132, "grad_norm": 10.227978706359863, "learning_rate": 1.4955768868812349e-05, "loss": 1.4948, "step": 6700 }, { "epoch": 1.2629399585921326, "grad_norm": 11.469473838806152, "learning_rate": 1.4948240165631472e-05, "loss": 1.2275, "step": 6710 }, { "epoch": 1.2648221343873518, "grad_norm": 31.51825714111328, "learning_rate": 1.4940711462450595e-05, "loss": 1.1679, "step": 6720 }, { "epoch": 1.266704310182571, "grad_norm": 8.267791748046875, "learning_rate": 1.4933182759269718e-05, "loss": 1.1554, "step": 6730 }, { "epoch": 1.2685864859777904, "grad_norm": 12.076141357421875, "learning_rate": 1.492565405608884e-05, "loss": 1.441, "step": 6740 }, { "epoch": 1.2704686617730097, "grad_norm": 8.693479537963867, "learning_rate": 1.4918125352907963e-05, "loss": 1.0406, "step": 6750 }, { "epoch": 1.2723508375682289, "grad_norm": 5.5710368156433105, "learning_rate": 1.4910596649727085e-05, "loss": 1.2251, "step": 6760 }, { "epoch": 1.274233013363448, "grad_norm": 19.289039611816406, "learning_rate": 1.4903067946546208e-05, "loss": 1.3377, "step": 6770 }, { "epoch": 1.2761151891586673, "grad_norm": 14.665457725524902, "learning_rate": 1.489553924336533e-05, "loss": 1.2305, "step": 6780 }, { "epoch": 1.2779973649538867, "grad_norm": 6.723300933837891, "learning_rate": 1.4888010540184455e-05, "loss": 1.3505, "step": 6790 }, { "epoch": 1.279879540749106, "grad_norm": 165.9130859375, "learning_rate": 1.4880481837003578e-05, "loss": 1.1513, "step": 6800 }, { "epoch": 1.2817617165443251, "grad_norm": 11.876227378845215, "learning_rate": 1.4872953133822701e-05, "loss": 1.1086, "step": 6810 }, { "epoch": 1.2836438923395446, "grad_norm": 5.059904098510742, "learning_rate": 1.4865424430641824e-05, "loss": 1.1162, "step": 6820 }, { "epoch": 1.2855260681347638, "grad_norm": 39.77632522583008, "learning_rate": 1.4857895727460947e-05, "loss": 1.1485, "step": 6830 }, { "epoch": 1.287408243929983, "grad_norm": 17.867996215820312, "learning_rate": 1.4850367024280068e-05, "loss": 1.0449, "step": 6840 }, { "epoch": 1.2892904197252024, "grad_norm": 4.805659294128418, "learning_rate": 1.4842838321099191e-05, "loss": 1.1365, "step": 6850 }, { "epoch": 1.2911725955204216, "grad_norm": 4.266025543212891, "learning_rate": 1.4835309617918314e-05, "loss": 1.4389, "step": 6860 }, { "epoch": 1.2930547713156408, "grad_norm": 6.8787455558776855, "learning_rate": 1.4827780914737437e-05, "loss": 1.2298, "step": 6870 }, { "epoch": 1.2949369471108603, "grad_norm": 8.046527862548828, "learning_rate": 1.4820252211556561e-05, "loss": 1.1355, "step": 6880 }, { "epoch": 1.2968191229060795, "grad_norm": 13.009222984313965, "learning_rate": 1.4812723508375684e-05, "loss": 1.2157, "step": 6890 }, { "epoch": 1.2987012987012987, "grad_norm": 16.49818992614746, "learning_rate": 1.4805194805194807e-05, "loss": 1.4171, "step": 6900 }, { "epoch": 1.300583474496518, "grad_norm": 19.328983306884766, "learning_rate": 1.479766610201393e-05, "loss": 1.4212, "step": 6910 }, { "epoch": 1.3024656502917373, "grad_norm": 19.792497634887695, "learning_rate": 1.4790137398833053e-05, "loss": 1.0885, "step": 6920 }, { "epoch": 1.3043478260869565, "grad_norm": 8.283934593200684, "learning_rate": 1.4782608695652174e-05, "loss": 1.0667, "step": 6930 }, { "epoch": 1.3062300018821758, "grad_norm": 5.211995601654053, "learning_rate": 1.4775079992471297e-05, "loss": 1.1669, "step": 6940 }, { "epoch": 1.308112177677395, "grad_norm": 20.76984214782715, "learning_rate": 1.476755128929042e-05, "loss": 1.5103, "step": 6950 }, { "epoch": 1.3099943534726144, "grad_norm": 45.730899810791016, "learning_rate": 1.4760022586109543e-05, "loss": 1.0752, "step": 6960 }, { "epoch": 1.3118765292678336, "grad_norm": 8.765107154846191, "learning_rate": 1.4752493882928666e-05, "loss": 1.2052, "step": 6970 }, { "epoch": 1.3137587050630528, "grad_norm": 46.306983947753906, "learning_rate": 1.474496517974779e-05, "loss": 1.4834, "step": 6980 }, { "epoch": 1.3156408808582722, "grad_norm": 9.06339168548584, "learning_rate": 1.4737436476566914e-05, "loss": 1.137, "step": 6990 }, { "epoch": 1.3175230566534915, "grad_norm": 12.679614067077637, "learning_rate": 1.4729907773386036e-05, "loss": 1.1793, "step": 7000 }, { "epoch": 1.3194052324487107, "grad_norm": 7.356766223907471, "learning_rate": 1.472237907020516e-05, "loss": 1.1861, "step": 7010 }, { "epoch": 1.32128740824393, "grad_norm": 17.430709838867188, "learning_rate": 1.471485036702428e-05, "loss": 1.0291, "step": 7020 }, { "epoch": 1.3231695840391493, "grad_norm": 11.125081062316895, "learning_rate": 1.4707321663843404e-05, "loss": 1.2338, "step": 7030 }, { "epoch": 1.3250517598343685, "grad_norm": 17.22634506225586, "learning_rate": 1.4699792960662526e-05, "loss": 1.2741, "step": 7040 }, { "epoch": 1.3269339356295877, "grad_norm": 37.80469512939453, "learning_rate": 1.469226425748165e-05, "loss": 1.2436, "step": 7050 }, { "epoch": 1.3288161114248072, "grad_norm": 7.4153971672058105, "learning_rate": 1.4684735554300772e-05, "loss": 1.0263, "step": 7060 }, { "epoch": 1.3306982872200264, "grad_norm": 19.874597549438477, "learning_rate": 1.4677206851119897e-05, "loss": 1.142, "step": 7070 }, { "epoch": 1.3325804630152456, "grad_norm": 18.514259338378906, "learning_rate": 1.466967814793902e-05, "loss": 1.3683, "step": 7080 }, { "epoch": 1.3344626388104648, "grad_norm": 25.53579330444336, "learning_rate": 1.4662149444758143e-05, "loss": 1.3415, "step": 7090 }, { "epoch": 1.3363448146056842, "grad_norm": 17.759546279907227, "learning_rate": 1.4654620741577264e-05, "loss": 1.6721, "step": 7100 }, { "epoch": 1.3382269904009034, "grad_norm": 7.8694915771484375, "learning_rate": 1.4647092038396387e-05, "loss": 1.0611, "step": 7110 }, { "epoch": 1.3401091661961226, "grad_norm": 16.278396606445312, "learning_rate": 1.463956333521551e-05, "loss": 1.1083, "step": 7120 }, { "epoch": 1.341991341991342, "grad_norm": 7.207310676574707, "learning_rate": 1.4632034632034633e-05, "loss": 0.996, "step": 7130 }, { "epoch": 1.3438735177865613, "grad_norm": 10.18614673614502, "learning_rate": 1.4624505928853756e-05, "loss": 1.2492, "step": 7140 }, { "epoch": 1.3457556935817805, "grad_norm": 11.438788414001465, "learning_rate": 1.4616977225672879e-05, "loss": 1.2819, "step": 7150 }, { "epoch": 1.347637869377, "grad_norm": 14.186439514160156, "learning_rate": 1.4609448522492003e-05, "loss": 1.2256, "step": 7160 }, { "epoch": 1.3495200451722191, "grad_norm": 9.92445182800293, "learning_rate": 1.4601919819311126e-05, "loss": 1.1, "step": 7170 }, { "epoch": 1.3514022209674383, "grad_norm": 10.874330520629883, "learning_rate": 1.4594391116130249e-05, "loss": 1.3286, "step": 7180 }, { "epoch": 1.3532843967626578, "grad_norm": 4.418787956237793, "learning_rate": 1.458686241294937e-05, "loss": 1.4054, "step": 7190 }, { "epoch": 1.355166572557877, "grad_norm": 17.510101318359375, "learning_rate": 1.4579333709768493e-05, "loss": 1.1078, "step": 7200 }, { "epoch": 1.3570487483530962, "grad_norm": 10.508594512939453, "learning_rate": 1.4571805006587616e-05, "loss": 1.1085, "step": 7210 }, { "epoch": 1.3589309241483154, "grad_norm": 14.85457706451416, "learning_rate": 1.4564276303406739e-05, "loss": 1.1159, "step": 7220 }, { "epoch": 1.3608130999435346, "grad_norm": 5.535599231719971, "learning_rate": 1.4556747600225862e-05, "loss": 1.1347, "step": 7230 }, { "epoch": 1.362695275738754, "grad_norm": 9.540611267089844, "learning_rate": 1.4549218897044985e-05, "loss": 1.1789, "step": 7240 }, { "epoch": 1.3645774515339733, "grad_norm": 18.45024299621582, "learning_rate": 1.4541690193864108e-05, "loss": 1.0515, "step": 7250 }, { "epoch": 1.3664596273291925, "grad_norm": 79.7477035522461, "learning_rate": 1.4534161490683232e-05, "loss": 1.1787, "step": 7260 }, { "epoch": 1.368341803124412, "grad_norm": 3.590022087097168, "learning_rate": 1.4526632787502355e-05, "loss": 1.2198, "step": 7270 }, { "epoch": 1.370223978919631, "grad_norm": 4.568202018737793, "learning_rate": 1.4519104084321477e-05, "loss": 1.32, "step": 7280 }, { "epoch": 1.3721061547148503, "grad_norm": 12.585639953613281, "learning_rate": 1.45115753811406e-05, "loss": 1.1478, "step": 7290 }, { "epoch": 1.3739883305100697, "grad_norm": 7.819540500640869, "learning_rate": 1.4504046677959722e-05, "loss": 1.3724, "step": 7300 }, { "epoch": 1.375870506305289, "grad_norm": 14.75216293334961, "learning_rate": 1.4496517974778845e-05, "loss": 1.2092, "step": 7310 }, { "epoch": 1.3777526821005082, "grad_norm": 25.505807876586914, "learning_rate": 1.4488989271597968e-05, "loss": 1.2517, "step": 7320 }, { "epoch": 1.3796348578957276, "grad_norm": 5.387889862060547, "learning_rate": 1.4481460568417091e-05, "loss": 0.861, "step": 7330 }, { "epoch": 1.3815170336909468, "grad_norm": 2.2768774032592773, "learning_rate": 1.4473931865236214e-05, "loss": 1.1739, "step": 7340 }, { "epoch": 1.383399209486166, "grad_norm": 10.858841896057129, "learning_rate": 1.4466403162055339e-05, "loss": 1.217, "step": 7350 }, { "epoch": 1.3852813852813852, "grad_norm": 21.92953872680664, "learning_rate": 1.4458874458874458e-05, "loss": 1.0472, "step": 7360 }, { "epoch": 1.3871635610766044, "grad_norm": 10.877849578857422, "learning_rate": 1.4451345755693583e-05, "loss": 1.199, "step": 7370 }, { "epoch": 1.3890457368718239, "grad_norm": 7.3010149002075195, "learning_rate": 1.4443817052512706e-05, "loss": 1.3164, "step": 7380 }, { "epoch": 1.390927912667043, "grad_norm": 3.245511293411255, "learning_rate": 1.4436288349331829e-05, "loss": 0.9705, "step": 7390 }, { "epoch": 1.3928100884622623, "grad_norm": 4.765195846557617, "learning_rate": 1.4428759646150952e-05, "loss": 1.0409, "step": 7400 }, { "epoch": 1.3946922642574817, "grad_norm": 9.110795021057129, "learning_rate": 1.4421230942970075e-05, "loss": 1.0267, "step": 7410 }, { "epoch": 1.396574440052701, "grad_norm": 11.141672134399414, "learning_rate": 1.4413702239789197e-05, "loss": 1.0451, "step": 7420 }, { "epoch": 1.3984566158479201, "grad_norm": 3.296548843383789, "learning_rate": 1.440617353660832e-05, "loss": 1.0522, "step": 7430 }, { "epoch": 1.4003387916431396, "grad_norm": 11.544855117797852, "learning_rate": 1.4398644833427445e-05, "loss": 1.097, "step": 7440 }, { "epoch": 1.4022209674383588, "grad_norm": 9.588610649108887, "learning_rate": 1.4391116130246565e-05, "loss": 1.0146, "step": 7450 }, { "epoch": 1.404103143233578, "grad_norm": 17.363183975219727, "learning_rate": 1.4383587427065689e-05, "loss": 1.2154, "step": 7460 }, { "epoch": 1.4059853190287974, "grad_norm": 6.3251495361328125, "learning_rate": 1.4376058723884812e-05, "loss": 1.0566, "step": 7470 }, { "epoch": 1.4078674948240166, "grad_norm": 2.8543460369110107, "learning_rate": 1.4368530020703935e-05, "loss": 1.1234, "step": 7480 }, { "epoch": 1.4097496706192358, "grad_norm": 41.35597229003906, "learning_rate": 1.4361001317523058e-05, "loss": 1.4789, "step": 7490 }, { "epoch": 1.411631846414455, "grad_norm": 8.920171737670898, "learning_rate": 1.435347261434218e-05, "loss": 1.0124, "step": 7500 }, { "epoch": 1.4135140222096743, "grad_norm": 9.12064266204834, "learning_rate": 1.4345943911161304e-05, "loss": 0.9119, "step": 7510 }, { "epoch": 1.4153961980048937, "grad_norm": 51.954063415527344, "learning_rate": 1.4338415207980427e-05, "loss": 1.0014, "step": 7520 }, { "epoch": 1.417278373800113, "grad_norm": 4.927883148193359, "learning_rate": 1.4330886504799551e-05, "loss": 1.3197, "step": 7530 }, { "epoch": 1.419160549595332, "grad_norm": 11.021320343017578, "learning_rate": 1.432335780161867e-05, "loss": 0.9996, "step": 7540 }, { "epoch": 1.4210427253905515, "grad_norm": 8.294415473937988, "learning_rate": 1.4315829098437794e-05, "loss": 1.1363, "step": 7550 }, { "epoch": 1.4229249011857708, "grad_norm": 12.463217735290527, "learning_rate": 1.4308300395256918e-05, "loss": 1.1546, "step": 7560 }, { "epoch": 1.42480707698099, "grad_norm": 5.810076713562012, "learning_rate": 1.4300771692076041e-05, "loss": 1.3381, "step": 7570 }, { "epoch": 1.4266892527762094, "grad_norm": 7.27411413192749, "learning_rate": 1.4293242988895164e-05, "loss": 1.237, "step": 7580 }, { "epoch": 1.4285714285714286, "grad_norm": 8.202913284301758, "learning_rate": 1.4285714285714287e-05, "loss": 0.8892, "step": 7590 }, { "epoch": 1.4304536043666478, "grad_norm": 6.186138153076172, "learning_rate": 1.427818558253341e-05, "loss": 1.3076, "step": 7600 }, { "epoch": 1.4323357801618672, "grad_norm": 16.254478454589844, "learning_rate": 1.4270656879352533e-05, "loss": 1.0453, "step": 7610 }, { "epoch": 1.4342179559570865, "grad_norm": 5.887212753295898, "learning_rate": 1.4263128176171656e-05, "loss": 1.0952, "step": 7620 }, { "epoch": 1.4361001317523057, "grad_norm": 58.455604553222656, "learning_rate": 1.4255599472990777e-05, "loss": 1.1879, "step": 7630 }, { "epoch": 1.4379823075475249, "grad_norm": 6.455393314361572, "learning_rate": 1.42480707698099e-05, "loss": 0.881, "step": 7640 }, { "epoch": 1.439864483342744, "grad_norm": 6.461804389953613, "learning_rate": 1.4240542066629025e-05, "loss": 0.9318, "step": 7650 }, { "epoch": 1.4417466591379635, "grad_norm": 1.1490024328231812, "learning_rate": 1.4233013363448148e-05, "loss": 1.0784, "step": 7660 }, { "epoch": 1.4436288349331827, "grad_norm": 22.495023727416992, "learning_rate": 1.422548466026727e-05, "loss": 1.086, "step": 7670 }, { "epoch": 1.445511010728402, "grad_norm": 9.839690208435059, "learning_rate": 1.4217955957086393e-05, "loss": 0.9114, "step": 7680 }, { "epoch": 1.4473931865236214, "grad_norm": 9.664743423461914, "learning_rate": 1.4210427253905516e-05, "loss": 1.1183, "step": 7690 }, { "epoch": 1.4492753623188406, "grad_norm": 2.7624919414520264, "learning_rate": 1.420289855072464e-05, "loss": 1.0288, "step": 7700 }, { "epoch": 1.4511575381140598, "grad_norm": 2.4246561527252197, "learning_rate": 1.419536984754376e-05, "loss": 1.0188, "step": 7710 }, { "epoch": 1.4530397139092792, "grad_norm": 7.706203937530518, "learning_rate": 1.4187841144362883e-05, "loss": 1.0697, "step": 7720 }, { "epoch": 1.4549218897044984, "grad_norm": 14.255431175231934, "learning_rate": 1.4180312441182006e-05, "loss": 1.0956, "step": 7730 }, { "epoch": 1.4568040654997176, "grad_norm": 12.368314743041992, "learning_rate": 1.4172783738001131e-05, "loss": 1.0669, "step": 7740 }, { "epoch": 1.458686241294937, "grad_norm": 11.548198699951172, "learning_rate": 1.4165255034820254e-05, "loss": 1.0574, "step": 7750 }, { "epoch": 1.4605684170901563, "grad_norm": 8.626538276672363, "learning_rate": 1.4157726331639377e-05, "loss": 1.1298, "step": 7760 }, { "epoch": 1.4624505928853755, "grad_norm": 7.213776111602783, "learning_rate": 1.41501976284585e-05, "loss": 1.2538, "step": 7770 }, { "epoch": 1.4643327686805947, "grad_norm": 28.54906463623047, "learning_rate": 1.4142668925277623e-05, "loss": 1.009, "step": 7780 }, { "epoch": 1.466214944475814, "grad_norm": 7.344525337219238, "learning_rate": 1.4135140222096746e-05, "loss": 0.8592, "step": 7790 }, { "epoch": 1.4680971202710333, "grad_norm": 14.066884994506836, "learning_rate": 1.4127611518915867e-05, "loss": 0.9045, "step": 7800 }, { "epoch": 1.4699792960662525, "grad_norm": 7.654214382171631, "learning_rate": 1.412008281573499e-05, "loss": 0.9722, "step": 7810 }, { "epoch": 1.4718614718614718, "grad_norm": 4.771880149841309, "learning_rate": 1.4112554112554113e-05, "loss": 1.0675, "step": 7820 }, { "epoch": 1.4737436476566912, "grad_norm": 12.305994987487793, "learning_rate": 1.4105025409373236e-05, "loss": 1.1632, "step": 7830 }, { "epoch": 1.4756258234519104, "grad_norm": 4.854207992553711, "learning_rate": 1.409749670619236e-05, "loss": 0.9186, "step": 7840 }, { "epoch": 1.4775079992471296, "grad_norm": 11.940600395202637, "learning_rate": 1.4089968003011483e-05, "loss": 1.0449, "step": 7850 }, { "epoch": 1.479390175042349, "grad_norm": 9.252134323120117, "learning_rate": 1.4082439299830606e-05, "loss": 1.0849, "step": 7860 }, { "epoch": 1.4812723508375683, "grad_norm": 91.1706771850586, "learning_rate": 1.4074910596649729e-05, "loss": 1.1134, "step": 7870 }, { "epoch": 1.4831545266327875, "grad_norm": 8.043750762939453, "learning_rate": 1.4067381893468852e-05, "loss": 1.0423, "step": 7880 }, { "epoch": 1.485036702428007, "grad_norm": 72.7489242553711, "learning_rate": 1.4059853190287973e-05, "loss": 0.9289, "step": 7890 }, { "epoch": 1.486918878223226, "grad_norm": 2.386556625366211, "learning_rate": 1.4052324487107096e-05, "loss": 1.0292, "step": 7900 }, { "epoch": 1.4888010540184453, "grad_norm": 25.438966751098633, "learning_rate": 1.4044795783926219e-05, "loss": 1.1286, "step": 7910 }, { "epoch": 1.4906832298136645, "grad_norm": 16.999435424804688, "learning_rate": 1.4037267080745342e-05, "loss": 1.0221, "step": 7920 }, { "epoch": 1.4925654056088837, "grad_norm": 14.521089553833008, "learning_rate": 1.4029738377564466e-05, "loss": 1.3035, "step": 7930 }, { "epoch": 1.4944475814041032, "grad_norm": 13.352957725524902, "learning_rate": 1.402220967438359e-05, "loss": 1.3361, "step": 7940 }, { "epoch": 1.4963297571993224, "grad_norm": 24.51966094970703, "learning_rate": 1.4014680971202712e-05, "loss": 1.0674, "step": 7950 }, { "epoch": 1.4982119329945416, "grad_norm": 4.013449668884277, "learning_rate": 1.4007152268021835e-05, "loss": 0.7399, "step": 7960 }, { "epoch": 1.500094108789761, "grad_norm": 8.218099594116211, "learning_rate": 1.3999623564840956e-05, "loss": 0.8721, "step": 7970 }, { "epoch": 1.5019762845849802, "grad_norm": 15.218042373657227, "learning_rate": 1.399209486166008e-05, "loss": 1.145, "step": 7980 }, { "epoch": 1.5038584603801994, "grad_norm": 16.717836380004883, "learning_rate": 1.3984566158479202e-05, "loss": 0.9064, "step": 7990 }, { "epoch": 1.5057406361754189, "grad_norm": 61.43841552734375, "learning_rate": 1.3977037455298325e-05, "loss": 0.9818, "step": 8000 }, { "epoch": 1.507622811970638, "grad_norm": 20.666854858398438, "learning_rate": 1.3969508752117448e-05, "loss": 0.9895, "step": 8010 }, { "epoch": 1.5095049877658573, "grad_norm": 36.14076614379883, "learning_rate": 1.3961980048936573e-05, "loss": 1.021, "step": 8020 }, { "epoch": 1.5113871635610767, "grad_norm": 18.77960777282715, "learning_rate": 1.3954451345755696e-05, "loss": 0.8724, "step": 8030 }, { "epoch": 1.513269339356296, "grad_norm": 6.997078895568848, "learning_rate": 1.3946922642574819e-05, "loss": 1.0927, "step": 8040 }, { "epoch": 1.5151515151515151, "grad_norm": 12.119707107543945, "learning_rate": 1.3939393939393942e-05, "loss": 0.9838, "step": 8050 }, { "epoch": 1.5170336909467346, "grad_norm": 28.478229522705078, "learning_rate": 1.3931865236213063e-05, "loss": 1.0145, "step": 8060 }, { "epoch": 1.5189158667419536, "grad_norm": 23.44718360900879, "learning_rate": 1.3924336533032186e-05, "loss": 0.9658, "step": 8070 }, { "epoch": 1.520798042537173, "grad_norm": 11.981337547302246, "learning_rate": 1.3916807829851309e-05, "loss": 1.0584, "step": 8080 }, { "epoch": 1.5226802183323922, "grad_norm": 9.942193984985352, "learning_rate": 1.3909279126670432e-05, "loss": 0.985, "step": 8090 }, { "epoch": 1.5245623941276114, "grad_norm": 6.533130645751953, "learning_rate": 1.3901750423489554e-05, "loss": 1.1664, "step": 8100 }, { "epoch": 1.5264445699228308, "grad_norm": 4.729006290435791, "learning_rate": 1.3894221720308679e-05, "loss": 1.0228, "step": 8110 }, { "epoch": 1.52832674571805, "grad_norm": 15.206283569335938, "learning_rate": 1.3886693017127802e-05, "loss": 1.2411, "step": 8120 }, { "epoch": 1.5302089215132693, "grad_norm": 16.714759826660156, "learning_rate": 1.3879164313946925e-05, "loss": 1.2209, "step": 8130 }, { "epoch": 1.5320910973084887, "grad_norm": 20.130807876586914, "learning_rate": 1.3871635610766048e-05, "loss": 1.0768, "step": 8140 }, { "epoch": 1.533973273103708, "grad_norm": 4.701301097869873, "learning_rate": 1.3864106907585169e-05, "loss": 0.9803, "step": 8150 }, { "epoch": 1.5358554488989271, "grad_norm": 23.33013153076172, "learning_rate": 1.3856578204404292e-05, "loss": 0.9926, "step": 8160 }, { "epoch": 1.5377376246941465, "grad_norm": 17.344905853271484, "learning_rate": 1.3849049501223415e-05, "loss": 1.2403, "step": 8170 }, { "epoch": 1.5396198004893658, "grad_norm": 4.836775302886963, "learning_rate": 1.3841520798042538e-05, "loss": 1.0559, "step": 8180 }, { "epoch": 1.541501976284585, "grad_norm": 13.495597839355469, "learning_rate": 1.383399209486166e-05, "loss": 0.995, "step": 8190 }, { "epoch": 1.5433841520798044, "grad_norm": 57.71432113647461, "learning_rate": 1.3826463391680784e-05, "loss": 1.0178, "step": 8200 }, { "epoch": 1.5452663278750234, "grad_norm": 16.587297439575195, "learning_rate": 1.3818934688499908e-05, "loss": 0.8892, "step": 8210 }, { "epoch": 1.5471485036702428, "grad_norm": 10.470789909362793, "learning_rate": 1.3811405985319031e-05, "loss": 0.8105, "step": 8220 }, { "epoch": 1.549030679465462, "grad_norm": 8.34557056427002, "learning_rate": 1.3803877282138154e-05, "loss": 1.214, "step": 8230 }, { "epoch": 1.5509128552606812, "grad_norm": 15.27405071258545, "learning_rate": 1.3796348578957275e-05, "loss": 0.8583, "step": 8240 }, { "epoch": 1.5527950310559007, "grad_norm": 10.931471824645996, "learning_rate": 1.3788819875776398e-05, "loss": 1.2385, "step": 8250 }, { "epoch": 1.5546772068511199, "grad_norm": 14.83597469329834, "learning_rate": 1.3781291172595521e-05, "loss": 0.8043, "step": 8260 }, { "epoch": 1.556559382646339, "grad_norm": 13.999913215637207, "learning_rate": 1.3773762469414644e-05, "loss": 0.8869, "step": 8270 }, { "epoch": 1.5584415584415585, "grad_norm": 2.012145519256592, "learning_rate": 1.3766233766233767e-05, "loss": 0.975, "step": 8280 }, { "epoch": 1.5603237342367777, "grad_norm": 21.390483856201172, "learning_rate": 1.375870506305289e-05, "loss": 1.3669, "step": 8290 }, { "epoch": 1.562205910031997, "grad_norm": 2.5964221954345703, "learning_rate": 1.3751176359872015e-05, "loss": 0.8333, "step": 8300 }, { "epoch": 1.5640880858272164, "grad_norm": 7.925995349884033, "learning_rate": 1.3743647656691137e-05, "loss": 1.3074, "step": 8310 }, { "epoch": 1.5659702616224356, "grad_norm": 11.784333229064941, "learning_rate": 1.3736118953510259e-05, "loss": 0.6514, "step": 8320 }, { "epoch": 1.5678524374176548, "grad_norm": 13.306680679321289, "learning_rate": 1.3728590250329382e-05, "loss": 0.9919, "step": 8330 }, { "epoch": 1.5697346132128742, "grad_norm": 4.451740264892578, "learning_rate": 1.3721061547148505e-05, "loss": 0.8925, "step": 8340 }, { "epoch": 1.5716167890080932, "grad_norm": 7.311500072479248, "learning_rate": 1.3713532843967627e-05, "loss": 0.8031, "step": 8350 }, { "epoch": 1.5734989648033126, "grad_norm": 20.71784210205078, "learning_rate": 1.370600414078675e-05, "loss": 1.1857, "step": 8360 }, { "epoch": 1.5753811405985318, "grad_norm": 10.758478164672852, "learning_rate": 1.3698475437605873e-05, "loss": 0.8431, "step": 8370 }, { "epoch": 1.577263316393751, "grad_norm": 7.647435665130615, "learning_rate": 1.3690946734424996e-05, "loss": 1.2386, "step": 8380 }, { "epoch": 1.5791454921889705, "grad_norm": 1.5989456176757812, "learning_rate": 1.368341803124412e-05, "loss": 1.154, "step": 8390 }, { "epoch": 1.5810276679841897, "grad_norm": 2.973799467086792, "learning_rate": 1.3675889328063244e-05, "loss": 0.8838, "step": 8400 }, { "epoch": 1.582909843779409, "grad_norm": 8.04106616973877, "learning_rate": 1.3668360624882365e-05, "loss": 1.0488, "step": 8410 }, { "epoch": 1.5847920195746283, "grad_norm": 23.760622024536133, "learning_rate": 1.3660831921701488e-05, "loss": 1.4356, "step": 8420 }, { "epoch": 1.5866741953698476, "grad_norm": 9.534244537353516, "learning_rate": 1.365330321852061e-05, "loss": 0.9192, "step": 8430 }, { "epoch": 1.5885563711650668, "grad_norm": 8.14865493774414, "learning_rate": 1.3645774515339734e-05, "loss": 1.0321, "step": 8440 }, { "epoch": 1.5904385469602862, "grad_norm": 5.678312301635742, "learning_rate": 1.3638245812158857e-05, "loss": 1.1119, "step": 8450 }, { "epoch": 1.5923207227555054, "grad_norm": 19.022024154663086, "learning_rate": 1.363071710897798e-05, "loss": 1.0776, "step": 8460 }, { "epoch": 1.5942028985507246, "grad_norm": 23.45431137084961, "learning_rate": 1.3623188405797103e-05, "loss": 0.9972, "step": 8470 }, { "epoch": 1.596085074345944, "grad_norm": 26.307212829589844, "learning_rate": 1.3615659702616225e-05, "loss": 0.9018, "step": 8480 }, { "epoch": 1.597967250141163, "grad_norm": 38.280635833740234, "learning_rate": 1.360813099943535e-05, "loss": 0.9647, "step": 8490 }, { "epoch": 1.5998494259363825, "grad_norm": 13.799212455749512, "learning_rate": 1.360060229625447e-05, "loss": 0.9421, "step": 8500 }, { "epoch": 1.601731601731602, "grad_norm": 3.608424425125122, "learning_rate": 1.3593073593073594e-05, "loss": 0.7064, "step": 8510 }, { "epoch": 1.6036137775268209, "grad_norm": 6.385775566101074, "learning_rate": 1.3585544889892717e-05, "loss": 0.8039, "step": 8520 }, { "epoch": 1.6054959533220403, "grad_norm": 13.767354965209961, "learning_rate": 1.357801618671184e-05, "loss": 0.9991, "step": 8530 }, { "epoch": 1.6073781291172595, "grad_norm": 4.132275581359863, "learning_rate": 1.3570487483530963e-05, "loss": 0.9002, "step": 8540 }, { "epoch": 1.6092603049124787, "grad_norm": 20.56551170349121, "learning_rate": 1.3562958780350086e-05, "loss": 1.114, "step": 8550 }, { "epoch": 1.6111424807076982, "grad_norm": 12.178905487060547, "learning_rate": 1.3555430077169209e-05, "loss": 0.9161, "step": 8560 }, { "epoch": 1.6130246565029174, "grad_norm": 9.772829055786133, "learning_rate": 1.3547901373988332e-05, "loss": 1.0685, "step": 8570 }, { "epoch": 1.6149068322981366, "grad_norm": 12.571735382080078, "learning_rate": 1.3540372670807453e-05, "loss": 0.8599, "step": 8580 }, { "epoch": 1.616789008093356, "grad_norm": 39.86054611206055, "learning_rate": 1.3532843967626576e-05, "loss": 0.8537, "step": 8590 }, { "epoch": 1.6186711838885752, "grad_norm": 40.22355651855469, "learning_rate": 1.35253152644457e-05, "loss": 1.1449, "step": 8600 }, { "epoch": 1.6205533596837944, "grad_norm": 13.236491203308105, "learning_rate": 1.3517786561264823e-05, "loss": 0.8977, "step": 8610 }, { "epoch": 1.6224355354790139, "grad_norm": 11.93606948852539, "learning_rate": 1.3510257858083946e-05, "loss": 0.8524, "step": 8620 }, { "epoch": 1.6243177112742329, "grad_norm": 11.476794242858887, "learning_rate": 1.350272915490307e-05, "loss": 1.1183, "step": 8630 }, { "epoch": 1.6261998870694523, "grad_norm": 8.892841339111328, "learning_rate": 1.3495200451722192e-05, "loss": 1.1046, "step": 8640 }, { "epoch": 1.6280820628646717, "grad_norm": 3.0642027854919434, "learning_rate": 1.3487671748541315e-05, "loss": 1.0486, "step": 8650 }, { "epoch": 1.6299642386598907, "grad_norm": 1.5371010303497314, "learning_rate": 1.3480143045360438e-05, "loss": 0.8492, "step": 8660 }, { "epoch": 1.6318464144551101, "grad_norm": 5.408362865447998, "learning_rate": 1.347261434217956e-05, "loss": 0.8306, "step": 8670 }, { "epoch": 1.6337285902503293, "grad_norm": 18.592185974121094, "learning_rate": 1.3465085638998682e-05, "loss": 1.0851, "step": 8680 }, { "epoch": 1.6356107660455486, "grad_norm": 5.693188667297363, "learning_rate": 1.3457556935817807e-05, "loss": 0.9415, "step": 8690 }, { "epoch": 1.637492941840768, "grad_norm": 17.856067657470703, "learning_rate": 1.345002823263693e-05, "loss": 1.0766, "step": 8700 }, { "epoch": 1.6393751176359872, "grad_norm": 22.922794342041016, "learning_rate": 1.3442499529456053e-05, "loss": 0.9564, "step": 8710 }, { "epoch": 1.6412572934312064, "grad_norm": 10.05300235748291, "learning_rate": 1.3434970826275176e-05, "loss": 0.7716, "step": 8720 }, { "epoch": 1.6431394692264258, "grad_norm": 20.4290771484375, "learning_rate": 1.3427442123094298e-05, "loss": 0.7658, "step": 8730 }, { "epoch": 1.645021645021645, "grad_norm": 5.071466445922852, "learning_rate": 1.3419913419913421e-05, "loss": 1.0068, "step": 8740 }, { "epoch": 1.6469038208168643, "grad_norm": 12.650185585021973, "learning_rate": 1.3412384716732544e-05, "loss": 1.2319, "step": 8750 }, { "epoch": 1.6487859966120837, "grad_norm": 27.368560791015625, "learning_rate": 1.3404856013551666e-05, "loss": 0.8958, "step": 8760 }, { "epoch": 1.6506681724073027, "grad_norm": 10.492888450622559, "learning_rate": 1.3397327310370788e-05, "loss": 1.0703, "step": 8770 }, { "epoch": 1.6525503482025221, "grad_norm": 15.578258514404297, "learning_rate": 1.3389798607189911e-05, "loss": 1.0944, "step": 8780 }, { "epoch": 1.6544325239977415, "grad_norm": 7.515652179718018, "learning_rate": 1.3382269904009036e-05, "loss": 1.3246, "step": 8790 }, { "epoch": 1.6563146997929605, "grad_norm": 9.480490684509277, "learning_rate": 1.3374741200828159e-05, "loss": 0.89, "step": 8800 }, { "epoch": 1.65819687558818, "grad_norm": 19.28694725036621, "learning_rate": 1.3367212497647282e-05, "loss": 0.6789, "step": 8810 }, { "epoch": 1.6600790513833992, "grad_norm": 7.083702564239502, "learning_rate": 1.3359683794466405e-05, "loss": 0.9053, "step": 8820 }, { "epoch": 1.6619612271786184, "grad_norm": 49.1333122253418, "learning_rate": 1.3352155091285528e-05, "loss": 1.0765, "step": 8830 }, { "epoch": 1.6638434029738378, "grad_norm": 3.065033197402954, "learning_rate": 1.334462638810465e-05, "loss": 0.9586, "step": 8840 }, { "epoch": 1.665725578769057, "grad_norm": 21.612653732299805, "learning_rate": 1.3337097684923772e-05, "loss": 1.0886, "step": 8850 }, { "epoch": 1.6676077545642762, "grad_norm": 24.604717254638672, "learning_rate": 1.3329568981742895e-05, "loss": 0.9681, "step": 8860 }, { "epoch": 1.6694899303594957, "grad_norm": 15.733131408691406, "learning_rate": 1.3322040278562018e-05, "loss": 1.0073, "step": 8870 }, { "epoch": 1.6713721061547149, "grad_norm": 30.626779556274414, "learning_rate": 1.3314511575381142e-05, "loss": 0.8748, "step": 8880 }, { "epoch": 1.673254281949934, "grad_norm": 7.506359100341797, "learning_rate": 1.3306982872200265e-05, "loss": 0.9653, "step": 8890 }, { "epoch": 1.6751364577451535, "grad_norm": 5.364382266998291, "learning_rate": 1.3299454169019388e-05, "loss": 1.0913, "step": 8900 }, { "epoch": 1.6770186335403725, "grad_norm": 7.690507411956787, "learning_rate": 1.3291925465838511e-05, "loss": 0.8708, "step": 8910 }, { "epoch": 1.678900809335592, "grad_norm": 5.078372478485107, "learning_rate": 1.3284396762657634e-05, "loss": 1.0961, "step": 8920 }, { "epoch": 1.6807829851308114, "grad_norm": 8.640311241149902, "learning_rate": 1.3276868059476755e-05, "loss": 1.1776, "step": 8930 }, { "epoch": 1.6826651609260304, "grad_norm": 8.303173065185547, "learning_rate": 1.3269339356295878e-05, "loss": 0.8504, "step": 8940 }, { "epoch": 1.6845473367212498, "grad_norm": 13.893898010253906, "learning_rate": 1.3261810653115001e-05, "loss": 0.9924, "step": 8950 }, { "epoch": 1.686429512516469, "grad_norm": 1.1722054481506348, "learning_rate": 1.3254281949934124e-05, "loss": 0.8902, "step": 8960 }, { "epoch": 1.6883116883116882, "grad_norm": 14.810260772705078, "learning_rate": 1.3246753246753249e-05, "loss": 0.9838, "step": 8970 }, { "epoch": 1.6901938641069076, "grad_norm": 6.508338451385498, "learning_rate": 1.3239224543572372e-05, "loss": 0.9525, "step": 8980 }, { "epoch": 1.6920760399021268, "grad_norm": 18.174537658691406, "learning_rate": 1.3231695840391494e-05, "loss": 1.2055, "step": 8990 }, { "epoch": 1.693958215697346, "grad_norm": 14.68772029876709, "learning_rate": 1.3224167137210617e-05, "loss": 0.7929, "step": 9000 }, { "epoch": 1.6958403914925655, "grad_norm": 8.183045387268066, "learning_rate": 1.321663843402974e-05, "loss": 1.0087, "step": 9010 }, { "epoch": 1.6977225672877847, "grad_norm": 29.93727684020996, "learning_rate": 1.3209109730848861e-05, "loss": 0.973, "step": 9020 }, { "epoch": 1.699604743083004, "grad_norm": 10.451583862304688, "learning_rate": 1.3201581027667984e-05, "loss": 0.868, "step": 9030 }, { "epoch": 1.7014869188782233, "grad_norm": 9.627779006958008, "learning_rate": 1.3194052324487107e-05, "loss": 0.9869, "step": 9040 }, { "epoch": 1.7033690946734426, "grad_norm": 15.182428359985352, "learning_rate": 1.318652362130623e-05, "loss": 0.9524, "step": 9050 }, { "epoch": 1.7052512704686618, "grad_norm": 18.331754684448242, "learning_rate": 1.3178994918125355e-05, "loss": 0.6643, "step": 9060 }, { "epoch": 1.7071334462638812, "grad_norm": 17.56291961669922, "learning_rate": 1.3171466214944478e-05, "loss": 1.0118, "step": 9070 }, { "epoch": 1.7090156220591002, "grad_norm": 31.774778366088867, "learning_rate": 1.31639375117636e-05, "loss": 0.8729, "step": 9080 }, { "epoch": 1.7108977978543196, "grad_norm": 31.31987953186035, "learning_rate": 1.3156408808582724e-05, "loss": 0.8254, "step": 9090 }, { "epoch": 1.7127799736495388, "grad_norm": 11.088515281677246, "learning_rate": 1.3148880105401847e-05, "loss": 0.9627, "step": 9100 }, { "epoch": 1.714662149444758, "grad_norm": 14.68917179107666, "learning_rate": 1.3141351402220968e-05, "loss": 0.8448, "step": 9110 }, { "epoch": 1.7165443252399775, "grad_norm": 8.825004577636719, "learning_rate": 1.313382269904009e-05, "loss": 1.0459, "step": 9120 }, { "epoch": 1.7184265010351967, "grad_norm": 5.718069076538086, "learning_rate": 1.3126293995859214e-05, "loss": 0.8609, "step": 9130 }, { "epoch": 1.7203086768304159, "grad_norm": 4.432228088378906, "learning_rate": 1.3118765292678337e-05, "loss": 0.7144, "step": 9140 }, { "epoch": 1.7221908526256353, "grad_norm": 17.18754768371582, "learning_rate": 1.311123658949746e-05, "loss": 1.2041, "step": 9150 }, { "epoch": 1.7240730284208545, "grad_norm": 31.874040603637695, "learning_rate": 1.3103707886316584e-05, "loss": 1.1127, "step": 9160 }, { "epoch": 1.7259552042160737, "grad_norm": 20.094186782836914, "learning_rate": 1.3096179183135707e-05, "loss": 1.0748, "step": 9170 }, { "epoch": 1.7278373800112932, "grad_norm": 11.178129196166992, "learning_rate": 1.308865047995483e-05, "loss": 1.1735, "step": 9180 }, { "epoch": 1.7297195558065124, "grad_norm": 15.87864875793457, "learning_rate": 1.3081121776773951e-05, "loss": 0.8207, "step": 9190 }, { "epoch": 1.7316017316017316, "grad_norm": 13.691402435302734, "learning_rate": 1.3073593073593074e-05, "loss": 1.0621, "step": 9200 }, { "epoch": 1.733483907396951, "grad_norm": 10.819106101989746, "learning_rate": 1.3066064370412197e-05, "loss": 1.1812, "step": 9210 }, { "epoch": 1.73536608319217, "grad_norm": 13.477234840393066, "learning_rate": 1.305853566723132e-05, "loss": 0.8893, "step": 9220 }, { "epoch": 1.7372482589873894, "grad_norm": 14.02323055267334, "learning_rate": 1.3051006964050443e-05, "loss": 0.9262, "step": 9230 }, { "epoch": 1.7391304347826086, "grad_norm": 18.27639389038086, "learning_rate": 1.3043478260869566e-05, "loss": 1.1554, "step": 9240 }, { "epoch": 1.7410126105778279, "grad_norm": 23.120223999023438, "learning_rate": 1.303594955768869e-05, "loss": 0.9298, "step": 9250 }, { "epoch": 1.7428947863730473, "grad_norm": 9.989583015441895, "learning_rate": 1.3028420854507813e-05, "loss": 0.9628, "step": 9260 }, { "epoch": 1.7447769621682665, "grad_norm": 1.417387843132019, "learning_rate": 1.3020892151326936e-05, "loss": 0.7619, "step": 9270 }, { "epoch": 1.7466591379634857, "grad_norm": 17.73150062561035, "learning_rate": 1.3013363448146057e-05, "loss": 0.8362, "step": 9280 }, { "epoch": 1.7485413137587051, "grad_norm": 22.77169418334961, "learning_rate": 1.300583474496518e-05, "loss": 1.1307, "step": 9290 }, { "epoch": 1.7504234895539243, "grad_norm": 14.768538475036621, "learning_rate": 1.2998306041784303e-05, "loss": 1.1565, "step": 9300 }, { "epoch": 1.7523056653491436, "grad_norm": 1.281310796737671, "learning_rate": 1.2990777338603426e-05, "loss": 0.7435, "step": 9310 }, { "epoch": 1.754187841144363, "grad_norm": 5.074652671813965, "learning_rate": 1.2983248635422549e-05, "loss": 0.7562, "step": 9320 }, { "epoch": 1.7560700169395822, "grad_norm": 24.476675033569336, "learning_rate": 1.2975719932241672e-05, "loss": 0.9345, "step": 9330 }, { "epoch": 1.7579521927348014, "grad_norm": 5.09979772567749, "learning_rate": 1.2968191229060797e-05, "loss": 0.8928, "step": 9340 }, { "epoch": 1.7598343685300208, "grad_norm": 18.816225051879883, "learning_rate": 1.296066252587992e-05, "loss": 0.6661, "step": 9350 }, { "epoch": 1.7617165443252398, "grad_norm": 14.58993911743164, "learning_rate": 1.2953133822699043e-05, "loss": 0.7788, "step": 9360 }, { "epoch": 1.7635987201204593, "grad_norm": 8.356761932373047, "learning_rate": 1.2945605119518164e-05, "loss": 0.9851, "step": 9370 }, { "epoch": 1.7654808959156785, "grad_norm": 19.27338218688965, "learning_rate": 1.2938076416337287e-05, "loss": 0.8405, "step": 9380 }, { "epoch": 1.7673630717108977, "grad_norm": 9.037829399108887, "learning_rate": 1.293054771315641e-05, "loss": 0.9395, "step": 9390 }, { "epoch": 1.7692452475061171, "grad_norm": 26.172557830810547, "learning_rate": 1.2923019009975533e-05, "loss": 1.0054, "step": 9400 }, { "epoch": 1.7711274233013363, "grad_norm": 17.3187313079834, "learning_rate": 1.2915490306794655e-05, "loss": 1.0615, "step": 9410 }, { "epoch": 1.7730095990965555, "grad_norm": 15.355307579040527, "learning_rate": 1.2907961603613778e-05, "loss": 0.8999, "step": 9420 }, { "epoch": 1.774891774891775, "grad_norm": 5.734640598297119, "learning_rate": 1.2900432900432901e-05, "loss": 0.6632, "step": 9430 }, { "epoch": 1.7767739506869942, "grad_norm": 15.65912914276123, "learning_rate": 1.2892904197252026e-05, "loss": 1.1037, "step": 9440 }, { "epoch": 1.7786561264822134, "grad_norm": 13.188482284545898, "learning_rate": 1.2885375494071149e-05, "loss": 1.1157, "step": 9450 }, { "epoch": 1.7805383022774328, "grad_norm": 5.424443244934082, "learning_rate": 1.287784679089027e-05, "loss": 0.8627, "step": 9460 }, { "epoch": 1.782420478072652, "grad_norm": 12.912461280822754, "learning_rate": 1.2870318087709393e-05, "loss": 0.6204, "step": 9470 }, { "epoch": 1.7843026538678712, "grad_norm": 7.88910436630249, "learning_rate": 1.2862789384528516e-05, "loss": 1.0365, "step": 9480 }, { "epoch": 1.7861848296630907, "grad_norm": 15.304657936096191, "learning_rate": 1.2855260681347639e-05, "loss": 1.1819, "step": 9490 }, { "epoch": 1.7880670054583097, "grad_norm": 1.5771760940551758, "learning_rate": 1.2847731978166762e-05, "loss": 0.7879, "step": 9500 }, { "epoch": 1.789949181253529, "grad_norm": 24.63703155517578, "learning_rate": 1.2840203274985885e-05, "loss": 0.8802, "step": 9510 }, { "epoch": 1.7918313570487485, "grad_norm": 16.861093521118164, "learning_rate": 1.2832674571805008e-05, "loss": 1.1376, "step": 9520 }, { "epoch": 1.7937135328439675, "grad_norm": 8.824302673339844, "learning_rate": 1.2825145868624132e-05, "loss": 0.9262, "step": 9530 }, { "epoch": 1.795595708639187, "grad_norm": 5.604579448699951, "learning_rate": 1.2817617165443252e-05, "loss": 1.1257, "step": 9540 }, { "epoch": 1.7974778844344061, "grad_norm": 15.983642578125, "learning_rate": 1.2810088462262376e-05, "loss": 0.94, "step": 9550 }, { "epoch": 1.7993600602296254, "grad_norm": 25.5296573638916, "learning_rate": 1.28025597590815e-05, "loss": 0.7451, "step": 9560 }, { "epoch": 1.8012422360248448, "grad_norm": 8.427245140075684, "learning_rate": 1.2795031055900622e-05, "loss": 0.8745, "step": 9570 }, { "epoch": 1.803124411820064, "grad_norm": 12.769954681396484, "learning_rate": 1.2787502352719745e-05, "loss": 1.1612, "step": 9580 }, { "epoch": 1.8050065876152832, "grad_norm": 12.274664878845215, "learning_rate": 1.2779973649538868e-05, "loss": 0.8479, "step": 9590 }, { "epoch": 1.8068887634105026, "grad_norm": 13.23458194732666, "learning_rate": 1.2772444946357991e-05, "loss": 0.7665, "step": 9600 }, { "epoch": 1.8087709392057219, "grad_norm": 61.233150482177734, "learning_rate": 1.2764916243177114e-05, "loss": 0.5963, "step": 9610 }, { "epoch": 1.810653115000941, "grad_norm": 20.53067970275879, "learning_rate": 1.2757387539996238e-05, "loss": 1.1002, "step": 9620 }, { "epoch": 1.8125352907961605, "grad_norm": 16.09151268005371, "learning_rate": 1.2749858836815358e-05, "loss": 1.1078, "step": 9630 }, { "epoch": 1.8144174665913795, "grad_norm": 4.28153133392334, "learning_rate": 1.2742330133634483e-05, "loss": 1.008, "step": 9640 }, { "epoch": 1.816299642386599, "grad_norm": 11.185966491699219, "learning_rate": 1.2734801430453606e-05, "loss": 1.1953, "step": 9650 }, { "epoch": 1.8181818181818183, "grad_norm": 4.550100326538086, "learning_rate": 1.2727272727272728e-05, "loss": 1.1886, "step": 9660 }, { "epoch": 1.8200639939770373, "grad_norm": 10.768220901489258, "learning_rate": 1.2719744024091851e-05, "loss": 1.1359, "step": 9670 }, { "epoch": 1.8219461697722568, "grad_norm": 10.578123092651367, "learning_rate": 1.2712215320910974e-05, "loss": 0.8207, "step": 9680 }, { "epoch": 1.823828345567476, "grad_norm": 25.884841918945312, "learning_rate": 1.2704686617730097e-05, "loss": 0.9762, "step": 9690 }, { "epoch": 1.8257105213626952, "grad_norm": 19.521249771118164, "learning_rate": 1.269715791454922e-05, "loss": 1.1357, "step": 9700 }, { "epoch": 1.8275926971579146, "grad_norm": 26.879844665527344, "learning_rate": 1.2689629211368343e-05, "loss": 1.2003, "step": 9710 }, { "epoch": 1.8294748729531338, "grad_norm": 11.771387100219727, "learning_rate": 1.2682100508187464e-05, "loss": 0.8366, "step": 9720 }, { "epoch": 1.831357048748353, "grad_norm": 3.633502960205078, "learning_rate": 1.2674571805006587e-05, "loss": 1.1486, "step": 9730 }, { "epoch": 1.8332392245435725, "grad_norm": 9.438892364501953, "learning_rate": 1.2667043101825712e-05, "loss": 0.9775, "step": 9740 }, { "epoch": 1.8351214003387917, "grad_norm": 9.89268684387207, "learning_rate": 1.2659514398644835e-05, "loss": 1.4666, "step": 9750 }, { "epoch": 1.8370035761340109, "grad_norm": 8.895881652832031, "learning_rate": 1.2651985695463958e-05, "loss": 0.8229, "step": 9760 }, { "epoch": 1.8388857519292303, "grad_norm": 18.56089210510254, "learning_rate": 1.264445699228308e-05, "loss": 0.8295, "step": 9770 }, { "epoch": 1.8407679277244493, "grad_norm": 11.154679298400879, "learning_rate": 1.2636928289102204e-05, "loss": 0.8866, "step": 9780 }, { "epoch": 1.8426501035196687, "grad_norm": 23.42184829711914, "learning_rate": 1.2629399585921326e-05, "loss": 1.0182, "step": 9790 }, { "epoch": 1.8445322793148882, "grad_norm": 11.54366683959961, "learning_rate": 1.2621870882740448e-05, "loss": 0.858, "step": 9800 }, { "epoch": 1.8464144551101072, "grad_norm": 32.32461929321289, "learning_rate": 1.261434217955957e-05, "loss": 0.8094, "step": 9810 }, { "epoch": 1.8482966309053266, "grad_norm": 17.969310760498047, "learning_rate": 1.2606813476378694e-05, "loss": 0.8606, "step": 9820 }, { "epoch": 1.8501788067005458, "grad_norm": 12.051008224487305, "learning_rate": 1.2599284773197818e-05, "loss": 0.8854, "step": 9830 }, { "epoch": 1.852060982495765, "grad_norm": 17.03795623779297, "learning_rate": 1.2591756070016941e-05, "loss": 1.0436, "step": 9840 }, { "epoch": 1.8539431582909844, "grad_norm": 21.712039947509766, "learning_rate": 1.2584227366836064e-05, "loss": 0.8406, "step": 9850 }, { "epoch": 1.8558253340862036, "grad_norm": 6.554458141326904, "learning_rate": 1.2576698663655187e-05, "loss": 0.9822, "step": 9860 }, { "epoch": 1.8577075098814229, "grad_norm": 17.28873634338379, "learning_rate": 1.256916996047431e-05, "loss": 0.8904, "step": 9870 }, { "epoch": 1.8595896856766423, "grad_norm": 6.655860424041748, "learning_rate": 1.2561641257293433e-05, "loss": 1.0043, "step": 9880 }, { "epoch": 1.8614718614718615, "grad_norm": 6.995543956756592, "learning_rate": 1.2554112554112554e-05, "loss": 1.1564, "step": 9890 }, { "epoch": 1.8633540372670807, "grad_norm": 18.659198760986328, "learning_rate": 1.2546583850931677e-05, "loss": 1.0026, "step": 9900 }, { "epoch": 1.8652362130623001, "grad_norm": 5.457601070404053, "learning_rate": 1.25390551477508e-05, "loss": 1.1253, "step": 9910 }, { "epoch": 1.8671183888575191, "grad_norm": 21.024433135986328, "learning_rate": 1.2531526444569924e-05, "loss": 1.077, "step": 9920 }, { "epoch": 1.8690005646527386, "grad_norm": 14.646206855773926, "learning_rate": 1.2523997741389047e-05, "loss": 0.6792, "step": 9930 }, { "epoch": 1.870882740447958, "grad_norm": 14.446063995361328, "learning_rate": 1.251646903820817e-05, "loss": 1.0064, "step": 9940 }, { "epoch": 1.872764916243177, "grad_norm": 19.97396469116211, "learning_rate": 1.2508940335027293e-05, "loss": 0.9491, "step": 9950 }, { "epoch": 1.8746470920383964, "grad_norm": 51.85739517211914, "learning_rate": 1.2501411631846416e-05, "loss": 0.9612, "step": 9960 }, { "epoch": 1.8765292678336156, "grad_norm": 4.516988277435303, "learning_rate": 1.2493882928665539e-05, "loss": 0.9424, "step": 9970 }, { "epoch": 1.8784114436288348, "grad_norm": 9.412343978881836, "learning_rate": 1.248635422548466e-05, "loss": 1.0825, "step": 9980 }, { "epoch": 1.8802936194240543, "grad_norm": 8.06126880645752, "learning_rate": 1.2478825522303783e-05, "loss": 0.7844, "step": 9990 }, { "epoch": 1.8821757952192735, "grad_norm": 5.933414459228516, "learning_rate": 1.2471296819122906e-05, "loss": 1.0082, "step": 10000 }, { "epoch": 1.8840579710144927, "grad_norm": 17.543575286865234, "learning_rate": 1.2463768115942029e-05, "loss": 0.8493, "step": 10010 }, { "epoch": 1.8859401468097121, "grad_norm": 10.805532455444336, "learning_rate": 1.2456239412761154e-05, "loss": 0.9843, "step": 10020 }, { "epoch": 1.8878223226049313, "grad_norm": 7.1772260665893555, "learning_rate": 1.2448710709580277e-05, "loss": 1.028, "step": 10030 }, { "epoch": 1.8897044984001505, "grad_norm": 20.371374130249023, "learning_rate": 1.24411820063994e-05, "loss": 0.8241, "step": 10040 }, { "epoch": 1.89158667419537, "grad_norm": 9.451379776000977, "learning_rate": 1.2433653303218522e-05, "loss": 1.057, "step": 10050 }, { "epoch": 1.8934688499905892, "grad_norm": 12.519644737243652, "learning_rate": 1.2426124600037644e-05, "loss": 0.8994, "step": 10060 }, { "epoch": 1.8953510257858084, "grad_norm": 2.6986730098724365, "learning_rate": 1.2418595896856767e-05, "loss": 0.7619, "step": 10070 }, { "epoch": 1.8972332015810278, "grad_norm": 21.105154037475586, "learning_rate": 1.241106719367589e-05, "loss": 1.0232, "step": 10080 }, { "epoch": 1.8991153773762468, "grad_norm": 12.050444602966309, "learning_rate": 1.2403538490495012e-05, "loss": 0.7324, "step": 10090 }, { "epoch": 1.9009975531714662, "grad_norm": 19.549890518188477, "learning_rate": 1.2396009787314135e-05, "loss": 1.2509, "step": 10100 }, { "epoch": 1.9028797289666854, "grad_norm": 12.393698692321777, "learning_rate": 1.238848108413326e-05, "loss": 0.982, "step": 10110 }, { "epoch": 1.9047619047619047, "grad_norm": 2.7038516998291016, "learning_rate": 1.2380952380952383e-05, "loss": 0.7445, "step": 10120 }, { "epoch": 1.906644080557124, "grad_norm": 14.256768226623535, "learning_rate": 1.2373423677771506e-05, "loss": 0.7698, "step": 10130 }, { "epoch": 1.9085262563523433, "grad_norm": 7.038768291473389, "learning_rate": 1.2365894974590629e-05, "loss": 0.6723, "step": 10140 }, { "epoch": 1.9104084321475625, "grad_norm": 11.244647026062012, "learning_rate": 1.235836627140975e-05, "loss": 1.0177, "step": 10150 }, { "epoch": 1.912290607942782, "grad_norm": 10.397238731384277, "learning_rate": 1.2350837568228873e-05, "loss": 1.1339, "step": 10160 }, { "epoch": 1.9141727837380011, "grad_norm": 15.638545036315918, "learning_rate": 1.2343308865047996e-05, "loss": 1.1344, "step": 10170 }, { "epoch": 1.9160549595332204, "grad_norm": 42.80206298828125, "learning_rate": 1.2335780161867119e-05, "loss": 0.9122, "step": 10180 }, { "epoch": 1.9179371353284398, "grad_norm": 13.100602149963379, "learning_rate": 1.2328251458686242e-05, "loss": 0.7872, "step": 10190 }, { "epoch": 1.919819311123659, "grad_norm": 4.41287088394165, "learning_rate": 1.2320722755505366e-05, "loss": 0.6886, "step": 10200 }, { "epoch": 1.9217014869188782, "grad_norm": 17.352832794189453, "learning_rate": 1.2313194052324489e-05, "loss": 0.7823, "step": 10210 }, { "epoch": 1.9235836627140976, "grad_norm": 8.47523021697998, "learning_rate": 1.2305665349143612e-05, "loss": 1.1246, "step": 10220 }, { "epoch": 1.9254658385093166, "grad_norm": 4.320629596710205, "learning_rate": 1.2298136645962735e-05, "loss": 1.0178, "step": 10230 }, { "epoch": 1.927348014304536, "grad_norm": 17.435626983642578, "learning_rate": 1.2290607942781856e-05, "loss": 1.2824, "step": 10240 }, { "epoch": 1.9292301900997553, "grad_norm": 18.203798294067383, "learning_rate": 1.2283079239600979e-05, "loss": 1.0708, "step": 10250 }, { "epoch": 1.9311123658949745, "grad_norm": 2.8385987281799316, "learning_rate": 1.2275550536420102e-05, "loss": 0.8138, "step": 10260 }, { "epoch": 1.932994541690194, "grad_norm": 17.48655128479004, "learning_rate": 1.2268021833239225e-05, "loss": 0.935, "step": 10270 }, { "epoch": 1.9348767174854131, "grad_norm": 7.490472793579102, "learning_rate": 1.2260493130058348e-05, "loss": 0.8238, "step": 10280 }, { "epoch": 1.9367588932806323, "grad_norm": 11.629676818847656, "learning_rate": 1.2252964426877473e-05, "loss": 0.816, "step": 10290 }, { "epoch": 1.9386410690758518, "grad_norm": 1.6730352640151978, "learning_rate": 1.2245435723696595e-05, "loss": 0.718, "step": 10300 }, { "epoch": 1.940523244871071, "grad_norm": 29.646865844726562, "learning_rate": 1.2237907020515718e-05, "loss": 0.948, "step": 10310 }, { "epoch": 1.9424054206662902, "grad_norm": 3.246751070022583, "learning_rate": 1.2230378317334841e-05, "loss": 1.0785, "step": 10320 }, { "epoch": 1.9442875964615096, "grad_norm": 14.672032356262207, "learning_rate": 1.2222849614153963e-05, "loss": 0.6108, "step": 10330 }, { "epoch": 1.9461697722567288, "grad_norm": 8.470264434814453, "learning_rate": 1.2215320910973085e-05, "loss": 0.9098, "step": 10340 }, { "epoch": 1.948051948051948, "grad_norm": 6.679137706756592, "learning_rate": 1.2207792207792208e-05, "loss": 0.6887, "step": 10350 }, { "epoch": 1.9499341238471675, "grad_norm": 17.44939613342285, "learning_rate": 1.2200263504611331e-05, "loss": 0.8049, "step": 10360 }, { "epoch": 1.9518162996423865, "grad_norm": 12.042229652404785, "learning_rate": 1.2192734801430454e-05, "loss": 0.6099, "step": 10370 }, { "epoch": 1.9536984754376059, "grad_norm": 22.439245223999023, "learning_rate": 1.2185206098249577e-05, "loss": 0.78, "step": 10380 }, { "epoch": 1.955580651232825, "grad_norm": 9.354933738708496, "learning_rate": 1.2177677395068702e-05, "loss": 0.8881, "step": 10390 }, { "epoch": 1.9574628270280443, "grad_norm": 9.417488098144531, "learning_rate": 1.2170148691887825e-05, "loss": 0.8555, "step": 10400 }, { "epoch": 1.9593450028232637, "grad_norm": 13.155538558959961, "learning_rate": 1.2162619988706946e-05, "loss": 1.0589, "step": 10410 }, { "epoch": 1.961227178618483, "grad_norm": 8.894850730895996, "learning_rate": 1.2155091285526069e-05, "loss": 0.8917, "step": 10420 }, { "epoch": 1.9631093544137022, "grad_norm": 4.504289627075195, "learning_rate": 1.2147562582345192e-05, "loss": 0.8084, "step": 10430 }, { "epoch": 1.9649915302089216, "grad_norm": 7.747999668121338, "learning_rate": 1.2140033879164315e-05, "loss": 0.7033, "step": 10440 }, { "epoch": 1.9668737060041408, "grad_norm": 28.37241554260254, "learning_rate": 1.2132505175983438e-05, "loss": 0.7308, "step": 10450 }, { "epoch": 1.96875588179936, "grad_norm": 10.770339965820312, "learning_rate": 1.212497647280256e-05, "loss": 1.0313, "step": 10460 }, { "epoch": 1.9706380575945794, "grad_norm": 16.21937370300293, "learning_rate": 1.2117447769621683e-05, "loss": 0.9229, "step": 10470 }, { "epoch": 1.9725202333897986, "grad_norm": 29.973360061645508, "learning_rate": 1.2109919066440808e-05, "loss": 0.972, "step": 10480 }, { "epoch": 1.9744024091850179, "grad_norm": 34.55480194091797, "learning_rate": 1.2102390363259931e-05, "loss": 0.9822, "step": 10490 }, { "epoch": 1.9762845849802373, "grad_norm": 18.8240909576416, "learning_rate": 1.2094861660079052e-05, "loss": 1.124, "step": 10500 }, { "epoch": 1.9781667607754563, "grad_norm": 5.787772178649902, "learning_rate": 1.2087332956898175e-05, "loss": 0.8912, "step": 10510 }, { "epoch": 1.9800489365706757, "grad_norm": 11.275154113769531, "learning_rate": 1.2079804253717298e-05, "loss": 0.8765, "step": 10520 }, { "epoch": 1.981931112365895, "grad_norm": 20.49886131286621, "learning_rate": 1.2072275550536421e-05, "loss": 0.9797, "step": 10530 }, { "epoch": 1.9838132881611141, "grad_norm": 16.942197799682617, "learning_rate": 1.2064746847355544e-05, "loss": 0.8569, "step": 10540 }, { "epoch": 1.9856954639563336, "grad_norm": 7.426819801330566, "learning_rate": 1.2057218144174667e-05, "loss": 0.6804, "step": 10550 }, { "epoch": 1.9875776397515528, "grad_norm": 16.51919174194336, "learning_rate": 1.204968944099379e-05, "loss": 0.6712, "step": 10560 }, { "epoch": 1.989459815546772, "grad_norm": 5.8685302734375, "learning_rate": 1.2042160737812914e-05, "loss": 0.9723, "step": 10570 }, { "epoch": 1.9913419913419914, "grad_norm": 21.12563705444336, "learning_rate": 1.2034632034632037e-05, "loss": 0.8149, "step": 10580 }, { "epoch": 1.9932241671372106, "grad_norm": 18.672962188720703, "learning_rate": 1.2027103331451157e-05, "loss": 0.8327, "step": 10590 }, { "epoch": 1.9951063429324298, "grad_norm": 15.490560531616211, "learning_rate": 1.2019574628270281e-05, "loss": 0.8651, "step": 10600 }, { "epoch": 1.9969885187276493, "grad_norm": 4.263712406158447, "learning_rate": 1.2012045925089404e-05, "loss": 0.9362, "step": 10610 }, { "epoch": 1.9988706945228685, "grad_norm": 11.25022029876709, "learning_rate": 1.2004517221908527e-05, "loss": 0.7861, "step": 10620 }, { "epoch": 2.0, "eval_accuracy": 0.9056, "eval_loss": 0.4684549570083618, "eval_runtime": 110.542, "eval_samples_per_second": 67.848, "eval_steps_per_second": 8.485, "step": 10626 }, { "epoch": 2.0007528703180877, "grad_norm": 24.8381404876709, "learning_rate": 1.199698851872765e-05, "loss": 0.8849, "step": 10630 }, { "epoch": 2.002635046113307, "grad_norm": 16.988882064819336, "learning_rate": 1.1989459815546773e-05, "loss": 0.8274, "step": 10640 }, { "epoch": 2.004517221908526, "grad_norm": 8.939057350158691, "learning_rate": 1.1981931112365896e-05, "loss": 1.0752, "step": 10650 }, { "epoch": 2.0063993977037455, "grad_norm": 7.0386128425598145, "learning_rate": 1.1974402409185019e-05, "loss": 0.8478, "step": 10660 }, { "epoch": 2.008281573498965, "grad_norm": 7.87394380569458, "learning_rate": 1.196687370600414e-05, "loss": 1.1662, "step": 10670 }, { "epoch": 2.010163749294184, "grad_norm": 10.221199035644531, "learning_rate": 1.1959345002823263e-05, "loss": 0.9248, "step": 10680 }, { "epoch": 2.0120459250894034, "grad_norm": 10.589498519897461, "learning_rate": 1.1951816299642388e-05, "loss": 0.5548, "step": 10690 }, { "epoch": 2.013928100884623, "grad_norm": 4.692480087280273, "learning_rate": 1.194428759646151e-05, "loss": 0.9416, "step": 10700 }, { "epoch": 2.015810276679842, "grad_norm": 12.261754989624023, "learning_rate": 1.1936758893280634e-05, "loss": 1.0959, "step": 10710 }, { "epoch": 2.0176924524750612, "grad_norm": 3.541266441345215, "learning_rate": 1.1929230190099756e-05, "loss": 0.9546, "step": 10720 }, { "epoch": 2.0195746282702802, "grad_norm": 23.496295928955078, "learning_rate": 1.192170148691888e-05, "loss": 0.9216, "step": 10730 }, { "epoch": 2.0214568040654997, "grad_norm": 35.39616394042969, "learning_rate": 1.1914172783738002e-05, "loss": 0.8117, "step": 10740 }, { "epoch": 2.023338979860719, "grad_norm": 5.2047247886657715, "learning_rate": 1.1906644080557125e-05, "loss": 1.0543, "step": 10750 }, { "epoch": 2.025221155655938, "grad_norm": 16.47121238708496, "learning_rate": 1.1899115377376246e-05, "loss": 0.9308, "step": 10760 }, { "epoch": 2.0271033314511575, "grad_norm": 11.237469673156738, "learning_rate": 1.189158667419537e-05, "loss": 0.5998, "step": 10770 }, { "epoch": 2.028985507246377, "grad_norm": 15.104199409484863, "learning_rate": 1.1884057971014494e-05, "loss": 0.7011, "step": 10780 }, { "epoch": 2.030867683041596, "grad_norm": 12.535137176513672, "learning_rate": 1.1876529267833617e-05, "loss": 0.7919, "step": 10790 }, { "epoch": 2.0327498588368154, "grad_norm": 3.7836413383483887, "learning_rate": 1.186900056465274e-05, "loss": 0.7689, "step": 10800 }, { "epoch": 2.034632034632035, "grad_norm": 40.574642181396484, "learning_rate": 1.1861471861471863e-05, "loss": 0.9367, "step": 10810 }, { "epoch": 2.036514210427254, "grad_norm": 14.20531177520752, "learning_rate": 1.1853943158290986e-05, "loss": 0.6538, "step": 10820 }, { "epoch": 2.038396386222473, "grad_norm": 11.008893966674805, "learning_rate": 1.1846414455110109e-05, "loss": 0.9406, "step": 10830 }, { "epoch": 2.0402785620176926, "grad_norm": 13.851802825927734, "learning_rate": 1.1838885751929231e-05, "loss": 0.8025, "step": 10840 }, { "epoch": 2.0421607378129116, "grad_norm": 11.819645881652832, "learning_rate": 1.1831357048748353e-05, "loss": 0.7047, "step": 10850 }, { "epoch": 2.044042913608131, "grad_norm": 0.6497629284858704, "learning_rate": 1.1823828345567476e-05, "loss": 0.8094, "step": 10860 }, { "epoch": 2.04592508940335, "grad_norm": 10.60029411315918, "learning_rate": 1.18162996423866e-05, "loss": 0.7603, "step": 10870 }, { "epoch": 2.0478072651985695, "grad_norm": 10.798954010009766, "learning_rate": 1.1808770939205723e-05, "loss": 0.7551, "step": 10880 }, { "epoch": 2.049689440993789, "grad_norm": 1.9502023458480835, "learning_rate": 1.1801242236024846e-05, "loss": 0.7977, "step": 10890 }, { "epoch": 2.051571616789008, "grad_norm": 7.010662078857422, "learning_rate": 1.1793713532843969e-05, "loss": 1.0109, "step": 10900 }, { "epoch": 2.0534537925842273, "grad_norm": 7.077285289764404, "learning_rate": 1.1786184829663092e-05, "loss": 0.72, "step": 10910 }, { "epoch": 2.0553359683794468, "grad_norm": 10.952967643737793, "learning_rate": 1.1778656126482215e-05, "loss": 0.7789, "step": 10920 }, { "epoch": 2.0572181441746658, "grad_norm": 1.8068546056747437, "learning_rate": 1.1771127423301338e-05, "loss": 0.5414, "step": 10930 }, { "epoch": 2.059100319969885, "grad_norm": 9.510477066040039, "learning_rate": 1.1763598720120459e-05, "loss": 0.7369, "step": 10940 }, { "epoch": 2.0609824957651046, "grad_norm": 15.91817855834961, "learning_rate": 1.1756070016939582e-05, "loss": 0.8333, "step": 10950 }, { "epoch": 2.0628646715603236, "grad_norm": 8.938312530517578, "learning_rate": 1.1748541313758705e-05, "loss": 0.7071, "step": 10960 }, { "epoch": 2.064746847355543, "grad_norm": 5.337620258331299, "learning_rate": 1.174101261057783e-05, "loss": 0.5705, "step": 10970 }, { "epoch": 2.0666290231507625, "grad_norm": 16.76058578491211, "learning_rate": 1.1733483907396952e-05, "loss": 0.8773, "step": 10980 }, { "epoch": 2.0685111989459815, "grad_norm": 5.9690985679626465, "learning_rate": 1.1725955204216075e-05, "loss": 0.8072, "step": 10990 }, { "epoch": 2.070393374741201, "grad_norm": 1.3085390329360962, "learning_rate": 1.1718426501035198e-05, "loss": 1.1046, "step": 11000 }, { "epoch": 2.0722755505364203, "grad_norm": 10.01681137084961, "learning_rate": 1.1710897797854321e-05, "loss": 1.0356, "step": 11010 }, { "epoch": 2.0741577263316393, "grad_norm": 16.690166473388672, "learning_rate": 1.1703369094673442e-05, "loss": 0.4895, "step": 11020 }, { "epoch": 2.0760399021268587, "grad_norm": 13.10949993133545, "learning_rate": 1.1695840391492565e-05, "loss": 0.63, "step": 11030 }, { "epoch": 2.0779220779220777, "grad_norm": 12.242857933044434, "learning_rate": 1.1688311688311688e-05, "loss": 0.6262, "step": 11040 }, { "epoch": 2.079804253717297, "grad_norm": 7.687251567840576, "learning_rate": 1.1680782985130811e-05, "loss": 0.715, "step": 11050 }, { "epoch": 2.0816864295125166, "grad_norm": 32.919715881347656, "learning_rate": 1.1673254281949936e-05, "loss": 0.7352, "step": 11060 }, { "epoch": 2.0835686053077356, "grad_norm": 4.056097507476807, "learning_rate": 1.1665725578769059e-05, "loss": 1.0831, "step": 11070 }, { "epoch": 2.085450781102955, "grad_norm": 2.21692156791687, "learning_rate": 1.1658196875588182e-05, "loss": 0.8726, "step": 11080 }, { "epoch": 2.0873329568981744, "grad_norm": 19.382230758666992, "learning_rate": 1.1650668172407305e-05, "loss": 0.7071, "step": 11090 }, { "epoch": 2.0892151326933934, "grad_norm": 5.093869209289551, "learning_rate": 1.1643139469226427e-05, "loss": 0.8924, "step": 11100 }, { "epoch": 2.091097308488613, "grad_norm": 19.17929458618164, "learning_rate": 1.1635610766045549e-05, "loss": 0.8245, "step": 11110 }, { "epoch": 2.0929794842838323, "grad_norm": 26.134408950805664, "learning_rate": 1.1628082062864672e-05, "loss": 0.7664, "step": 11120 }, { "epoch": 2.0948616600790513, "grad_norm": 22.83806800842285, "learning_rate": 1.1620553359683795e-05, "loss": 0.901, "step": 11130 }, { "epoch": 2.0967438358742707, "grad_norm": 2.848191976547241, "learning_rate": 1.1613024656502917e-05, "loss": 0.8783, "step": 11140 }, { "epoch": 2.09862601166949, "grad_norm": 3.205889940261841, "learning_rate": 1.1605495953322042e-05, "loss": 0.8159, "step": 11150 }, { "epoch": 2.100508187464709, "grad_norm": 12.333131790161133, "learning_rate": 1.1597967250141165e-05, "loss": 0.7015, "step": 11160 }, { "epoch": 2.1023903632599286, "grad_norm": 3.75451922416687, "learning_rate": 1.1590438546960288e-05, "loss": 0.794, "step": 11170 }, { "epoch": 2.1042725390551476, "grad_norm": 13.827683448791504, "learning_rate": 1.158290984377941e-05, "loss": 0.7039, "step": 11180 }, { "epoch": 2.106154714850367, "grad_norm": 1.8480725288391113, "learning_rate": 1.1575381140598534e-05, "loss": 0.8531, "step": 11190 }, { "epoch": 2.1080368906455864, "grad_norm": 13.693140029907227, "learning_rate": 1.1567852437417655e-05, "loss": 1.3104, "step": 11200 }, { "epoch": 2.1099190664408054, "grad_norm": 11.603228569030762, "learning_rate": 1.1560323734236778e-05, "loss": 1.1264, "step": 11210 }, { "epoch": 2.111801242236025, "grad_norm": 9.185226440429688, "learning_rate": 1.15527950310559e-05, "loss": 0.9391, "step": 11220 }, { "epoch": 2.1136834180312443, "grad_norm": 17.381254196166992, "learning_rate": 1.1545266327875024e-05, "loss": 0.6598, "step": 11230 }, { "epoch": 2.1155655938264633, "grad_norm": 8.61845874786377, "learning_rate": 1.1537737624694147e-05, "loss": 0.7259, "step": 11240 }, { "epoch": 2.1174477696216827, "grad_norm": 9.44906997680664, "learning_rate": 1.1530208921513271e-05, "loss": 0.6785, "step": 11250 }, { "epoch": 2.119329945416902, "grad_norm": 19.07695770263672, "learning_rate": 1.1522680218332394e-05, "loss": 0.7864, "step": 11260 }, { "epoch": 2.121212121212121, "grad_norm": 16.53068733215332, "learning_rate": 1.1515151515151517e-05, "loss": 0.7151, "step": 11270 }, { "epoch": 2.1230942970073405, "grad_norm": 3.438575506210327, "learning_rate": 1.1507622811970638e-05, "loss": 0.6242, "step": 11280 }, { "epoch": 2.12497647280256, "grad_norm": 15.59850025177002, "learning_rate": 1.1500094108789761e-05, "loss": 0.9812, "step": 11290 }, { "epoch": 2.126858648597779, "grad_norm": 37.894386291503906, "learning_rate": 1.1492565405608884e-05, "loss": 0.8772, "step": 11300 }, { "epoch": 2.1287408243929984, "grad_norm": 5.406034469604492, "learning_rate": 1.1485036702428007e-05, "loss": 0.8852, "step": 11310 }, { "epoch": 2.1306230001882174, "grad_norm": 39.29657745361328, "learning_rate": 1.147750799924713e-05, "loss": 1.0962, "step": 11320 }, { "epoch": 2.132505175983437, "grad_norm": 8.793197631835938, "learning_rate": 1.1469979296066253e-05, "loss": 0.89, "step": 11330 }, { "epoch": 2.1343873517786562, "grad_norm": 11.008149147033691, "learning_rate": 1.1462450592885378e-05, "loss": 0.9019, "step": 11340 }, { "epoch": 2.1362695275738752, "grad_norm": 9.002307891845703, "learning_rate": 1.14549218897045e-05, "loss": 1.0197, "step": 11350 }, { "epoch": 2.1381517033690947, "grad_norm": 16.297245025634766, "learning_rate": 1.1447393186523623e-05, "loss": 1.0305, "step": 11360 }, { "epoch": 2.140033879164314, "grad_norm": 16.021282196044922, "learning_rate": 1.1439864483342745e-05, "loss": 0.7957, "step": 11370 }, { "epoch": 2.141916054959533, "grad_norm": 12.168352127075195, "learning_rate": 1.1432335780161868e-05, "loss": 0.7707, "step": 11380 }, { "epoch": 2.1437982307547525, "grad_norm": 28.62824058532715, "learning_rate": 1.142480707698099e-05, "loss": 0.9935, "step": 11390 }, { "epoch": 2.145680406549972, "grad_norm": 8.890032768249512, "learning_rate": 1.1417278373800113e-05, "loss": 0.7974, "step": 11400 }, { "epoch": 2.147562582345191, "grad_norm": 7.006708145141602, "learning_rate": 1.1409749670619236e-05, "loss": 1.1413, "step": 11410 }, { "epoch": 2.1494447581404104, "grad_norm": 11.208281517028809, "learning_rate": 1.140222096743836e-05, "loss": 0.8733, "step": 11420 }, { "epoch": 2.15132693393563, "grad_norm": 37.206809997558594, "learning_rate": 1.1394692264257484e-05, "loss": 0.7413, "step": 11430 }, { "epoch": 2.153209109730849, "grad_norm": 4.180663585662842, "learning_rate": 1.1387163561076607e-05, "loss": 0.9974, "step": 11440 }, { "epoch": 2.155091285526068, "grad_norm": 13.185440063476562, "learning_rate": 1.137963485789573e-05, "loss": 1.164, "step": 11450 }, { "epoch": 2.1569734613212876, "grad_norm": 14.27194595336914, "learning_rate": 1.1372106154714851e-05, "loss": 0.6064, "step": 11460 }, { "epoch": 2.1588556371165066, "grad_norm": 34.04643249511719, "learning_rate": 1.1364577451533974e-05, "loss": 0.6796, "step": 11470 }, { "epoch": 2.160737812911726, "grad_norm": 19.426414489746094, "learning_rate": 1.1357048748353097e-05, "loss": 0.806, "step": 11480 }, { "epoch": 2.162619988706945, "grad_norm": 8.01048469543457, "learning_rate": 1.134952004517222e-05, "loss": 0.7656, "step": 11490 }, { "epoch": 2.1645021645021645, "grad_norm": 16.02117919921875, "learning_rate": 1.1341991341991343e-05, "loss": 0.7809, "step": 11500 }, { "epoch": 2.166384340297384, "grad_norm": 4.275322914123535, "learning_rate": 1.1334462638810466e-05, "loss": 0.7501, "step": 11510 }, { "epoch": 2.168266516092603, "grad_norm": 26.736955642700195, "learning_rate": 1.132693393562959e-05, "loss": 0.7723, "step": 11520 }, { "epoch": 2.1701486918878223, "grad_norm": 7.618874549865723, "learning_rate": 1.1319405232448713e-05, "loss": 0.713, "step": 11530 }, { "epoch": 2.1720308676830418, "grad_norm": 10.420034408569336, "learning_rate": 1.1311876529267836e-05, "loss": 0.9617, "step": 11540 }, { "epoch": 2.1739130434782608, "grad_norm": 8.347007751464844, "learning_rate": 1.1304347826086957e-05, "loss": 0.7585, "step": 11550 }, { "epoch": 2.17579521927348, "grad_norm": 20.805870056152344, "learning_rate": 1.129681912290608e-05, "loss": 0.9854, "step": 11560 }, { "epoch": 2.1776773950686996, "grad_norm": 9.123351097106934, "learning_rate": 1.1289290419725203e-05, "loss": 0.8321, "step": 11570 }, { "epoch": 2.1795595708639186, "grad_norm": 11.314193725585938, "learning_rate": 1.1281761716544326e-05, "loss": 0.84, "step": 11580 }, { "epoch": 2.181441746659138, "grad_norm": 10.206731796264648, "learning_rate": 1.1274233013363449e-05, "loss": 0.5685, "step": 11590 }, { "epoch": 2.183323922454357, "grad_norm": 11.790088653564453, "learning_rate": 1.1266704310182572e-05, "loss": 0.7315, "step": 11600 }, { "epoch": 2.1852060982495765, "grad_norm": 15.779023170471191, "learning_rate": 1.1259175607001695e-05, "loss": 0.745, "step": 11610 }, { "epoch": 2.187088274044796, "grad_norm": 17.365673065185547, "learning_rate": 1.125164690382082e-05, "loss": 0.981, "step": 11620 }, { "epoch": 2.188970449840015, "grad_norm": 1.291931390762329, "learning_rate": 1.1244118200639939e-05, "loss": 1.0725, "step": 11630 }, { "epoch": 2.1908526256352343, "grad_norm": 17.065105438232422, "learning_rate": 1.1236589497459064e-05, "loss": 1.073, "step": 11640 }, { "epoch": 2.1927348014304537, "grad_norm": 4.217034816741943, "learning_rate": 1.1229060794278186e-05, "loss": 0.6659, "step": 11650 }, { "epoch": 2.1946169772256727, "grad_norm": 1.8660165071487427, "learning_rate": 1.122153209109731e-05, "loss": 0.6897, "step": 11660 }, { "epoch": 2.196499153020892, "grad_norm": 7.8330817222595215, "learning_rate": 1.1214003387916432e-05, "loss": 0.7756, "step": 11670 }, { "epoch": 2.1983813288161116, "grad_norm": 19.692419052124023, "learning_rate": 1.1206474684735555e-05, "loss": 1.0101, "step": 11680 }, { "epoch": 2.2002635046113306, "grad_norm": 43.92416000366211, "learning_rate": 1.1198945981554678e-05, "loss": 1.0972, "step": 11690 }, { "epoch": 2.20214568040655, "grad_norm": 1.8937373161315918, "learning_rate": 1.1191417278373801e-05, "loss": 0.8669, "step": 11700 }, { "epoch": 2.2040278562017694, "grad_norm": 20.478147506713867, "learning_rate": 1.1183888575192926e-05, "loss": 1.0169, "step": 11710 }, { "epoch": 2.2059100319969884, "grad_norm": 18.37138557434082, "learning_rate": 1.1176359872012045e-05, "loss": 0.6301, "step": 11720 }, { "epoch": 2.207792207792208, "grad_norm": 5.762269496917725, "learning_rate": 1.116883116883117e-05, "loss": 0.9702, "step": 11730 }, { "epoch": 2.2096743835874273, "grad_norm": 23.213031768798828, "learning_rate": 1.1161302465650293e-05, "loss": 0.6551, "step": 11740 }, { "epoch": 2.2115565593826463, "grad_norm": 19.10317039489746, "learning_rate": 1.1153773762469416e-05, "loss": 1.0178, "step": 11750 }, { "epoch": 2.2134387351778657, "grad_norm": 15.511590957641602, "learning_rate": 1.1146245059288539e-05, "loss": 0.799, "step": 11760 }, { "epoch": 2.2153209109730847, "grad_norm": 11.412705421447754, "learning_rate": 1.1138716356107661e-05, "loss": 1.0609, "step": 11770 }, { "epoch": 2.217203086768304, "grad_norm": 26.8338680267334, "learning_rate": 1.1131187652926784e-05, "loss": 0.9019, "step": 11780 }, { "epoch": 2.2190852625635236, "grad_norm": 9.526796340942383, "learning_rate": 1.1123658949745907e-05, "loss": 0.6681, "step": 11790 }, { "epoch": 2.2209674383587426, "grad_norm": 21.135456085205078, "learning_rate": 1.1116130246565032e-05, "loss": 0.5308, "step": 11800 }, { "epoch": 2.222849614153962, "grad_norm": 21.3120059967041, "learning_rate": 1.1108601543384151e-05, "loss": 0.8701, "step": 11810 }, { "epoch": 2.2247317899491814, "grad_norm": 8.938432693481445, "learning_rate": 1.1101072840203274e-05, "loss": 0.8294, "step": 11820 }, { "epoch": 2.2266139657444004, "grad_norm": 21.0710506439209, "learning_rate": 1.1093544137022399e-05, "loss": 0.5845, "step": 11830 }, { "epoch": 2.22849614153962, "grad_norm": 13.667383193969727, "learning_rate": 1.1086015433841522e-05, "loss": 1.137, "step": 11840 }, { "epoch": 2.2303783173348393, "grad_norm": 7.945625305175781, "learning_rate": 1.1078486730660645e-05, "loss": 0.774, "step": 11850 }, { "epoch": 2.2322604931300583, "grad_norm": 13.387373924255371, "learning_rate": 1.1070958027479768e-05, "loss": 0.6535, "step": 11860 }, { "epoch": 2.2341426689252777, "grad_norm": 16.70855140686035, "learning_rate": 1.106342932429889e-05, "loss": 0.6098, "step": 11870 }, { "epoch": 2.2360248447204967, "grad_norm": 19.224872589111328, "learning_rate": 1.1055900621118014e-05, "loss": 0.8751, "step": 11880 }, { "epoch": 2.237907020515716, "grad_norm": 12.4450044631958, "learning_rate": 1.1048371917937135e-05, "loss": 1.0692, "step": 11890 }, { "epoch": 2.2397891963109355, "grad_norm": 19.086071014404297, "learning_rate": 1.1040843214756258e-05, "loss": 0.9096, "step": 11900 }, { "epoch": 2.2416713721061545, "grad_norm": 18.77231788635254, "learning_rate": 1.103331451157538e-05, "loss": 0.8296, "step": 11910 }, { "epoch": 2.243553547901374, "grad_norm": 10.611273765563965, "learning_rate": 1.1025785808394505e-05, "loss": 0.7729, "step": 11920 }, { "epoch": 2.2454357236965934, "grad_norm": 19.948476791381836, "learning_rate": 1.1018257105213628e-05, "loss": 0.6645, "step": 11930 }, { "epoch": 2.2473178994918124, "grad_norm": 7.409848690032959, "learning_rate": 1.1010728402032751e-05, "loss": 0.6516, "step": 11940 }, { "epoch": 2.249200075287032, "grad_norm": 6.684335231781006, "learning_rate": 1.1003199698851874e-05, "loss": 0.986, "step": 11950 }, { "epoch": 2.2510822510822512, "grad_norm": 21.333091735839844, "learning_rate": 1.0995670995670997e-05, "loss": 0.7682, "step": 11960 }, { "epoch": 2.2529644268774702, "grad_norm": 4.745196342468262, "learning_rate": 1.098814229249012e-05, "loss": 0.488, "step": 11970 }, { "epoch": 2.2548466026726897, "grad_norm": 24.217205047607422, "learning_rate": 1.0980613589309241e-05, "loss": 0.5174, "step": 11980 }, { "epoch": 2.256728778467909, "grad_norm": 9.572256088256836, "learning_rate": 1.0973084886128364e-05, "loss": 0.7882, "step": 11990 }, { "epoch": 2.258610954263128, "grad_norm": 2.003173828125, "learning_rate": 1.0965556182947487e-05, "loss": 0.6915, "step": 12000 }, { "epoch": 2.2604931300583475, "grad_norm": 39.94436264038086, "learning_rate": 1.0958027479766612e-05, "loss": 0.9829, "step": 12010 }, { "epoch": 2.262375305853567, "grad_norm": 16.391490936279297, "learning_rate": 1.0950498776585735e-05, "loss": 0.792, "step": 12020 }, { "epoch": 2.264257481648786, "grad_norm": 19.83889389038086, "learning_rate": 1.0942970073404857e-05, "loss": 0.8692, "step": 12030 }, { "epoch": 2.2661396574440054, "grad_norm": 10.875959396362305, "learning_rate": 1.093544137022398e-05, "loss": 0.6063, "step": 12040 }, { "epoch": 2.2680218332392243, "grad_norm": 0.8151828646659851, "learning_rate": 1.0927912667043103e-05, "loss": 0.5955, "step": 12050 }, { "epoch": 2.269904009034444, "grad_norm": 4.661582946777344, "learning_rate": 1.0920383963862226e-05, "loss": 0.8415, "step": 12060 }, { "epoch": 2.271786184829663, "grad_norm": 6.517459392547607, "learning_rate": 1.0912855260681347e-05, "loss": 0.703, "step": 12070 }, { "epoch": 2.273668360624882, "grad_norm": 8.205318450927734, "learning_rate": 1.090532655750047e-05, "loss": 0.6762, "step": 12080 }, { "epoch": 2.2755505364201016, "grad_norm": 46.9409065246582, "learning_rate": 1.0897797854319593e-05, "loss": 0.6102, "step": 12090 }, { "epoch": 2.277432712215321, "grad_norm": 23.41012954711914, "learning_rate": 1.0890269151138718e-05, "loss": 0.787, "step": 12100 }, { "epoch": 2.27931488801054, "grad_norm": 7.724615097045898, "learning_rate": 1.088274044795784e-05, "loss": 0.9145, "step": 12110 }, { "epoch": 2.2811970638057595, "grad_norm": 10.711122512817383, "learning_rate": 1.0875211744776964e-05, "loss": 0.7992, "step": 12120 }, { "epoch": 2.283079239600979, "grad_norm": 6.124819278717041, "learning_rate": 1.0867683041596087e-05, "loss": 0.6344, "step": 12130 }, { "epoch": 2.284961415396198, "grad_norm": 27.855012893676758, "learning_rate": 1.086015433841521e-05, "loss": 0.6282, "step": 12140 }, { "epoch": 2.2868435911914173, "grad_norm": 7.3847832679748535, "learning_rate": 1.0852625635234333e-05, "loss": 0.7378, "step": 12150 }, { "epoch": 2.2887257669866363, "grad_norm": 13.684946060180664, "learning_rate": 1.0845096932053454e-05, "loss": 0.5045, "step": 12160 }, { "epoch": 2.2906079427818558, "grad_norm": 14.742804527282715, "learning_rate": 1.0837568228872577e-05, "loss": 1.0028, "step": 12170 }, { "epoch": 2.292490118577075, "grad_norm": 10.077812194824219, "learning_rate": 1.08300395256917e-05, "loss": 0.7794, "step": 12180 }, { "epoch": 2.2943722943722946, "grad_norm": 33.63645553588867, "learning_rate": 1.0822510822510823e-05, "loss": 0.9631, "step": 12190 }, { "epoch": 2.2962544701675136, "grad_norm": 9.247525215148926, "learning_rate": 1.0814982119329947e-05, "loss": 0.5459, "step": 12200 }, { "epoch": 2.298136645962733, "grad_norm": 5.322352409362793, "learning_rate": 1.080745341614907e-05, "loss": 0.8692, "step": 12210 }, { "epoch": 2.300018821757952, "grad_norm": 29.076679229736328, "learning_rate": 1.0799924712968193e-05, "loss": 0.8797, "step": 12220 }, { "epoch": 2.3019009975531715, "grad_norm": 18.286100387573242, "learning_rate": 1.0792396009787316e-05, "loss": 0.9282, "step": 12230 }, { "epoch": 2.303783173348391, "grad_norm": 8.712194442749023, "learning_rate": 1.0784867306606437e-05, "loss": 0.7751, "step": 12240 }, { "epoch": 2.30566534914361, "grad_norm": 2.8198611736297607, "learning_rate": 1.077733860342556e-05, "loss": 0.7057, "step": 12250 }, { "epoch": 2.3075475249388293, "grad_norm": 23.1691951751709, "learning_rate": 1.0769809900244683e-05, "loss": 0.8707, "step": 12260 }, { "epoch": 2.3094297007340487, "grad_norm": 10.105401992797852, "learning_rate": 1.0762281197063806e-05, "loss": 0.9284, "step": 12270 }, { "epoch": 2.3113118765292677, "grad_norm": 29.39499855041504, "learning_rate": 1.0754752493882929e-05, "loss": 0.6555, "step": 12280 }, { "epoch": 2.313194052324487, "grad_norm": 8.851414680480957, "learning_rate": 1.0747223790702053e-05, "loss": 0.9129, "step": 12290 }, { "epoch": 2.3150762281197066, "grad_norm": 1.137457251548767, "learning_rate": 1.0739695087521176e-05, "loss": 0.9343, "step": 12300 }, { "epoch": 2.3169584039149256, "grad_norm": 25.591732025146484, "learning_rate": 1.07321663843403e-05, "loss": 0.682, "step": 12310 }, { "epoch": 2.318840579710145, "grad_norm": 26.291501998901367, "learning_rate": 1.0724637681159422e-05, "loss": 0.8744, "step": 12320 }, { "epoch": 2.320722755505364, "grad_norm": 15.204377174377441, "learning_rate": 1.0717108977978543e-05, "loss": 0.6868, "step": 12330 }, { "epoch": 2.3226049313005834, "grad_norm": 26.76555633544922, "learning_rate": 1.0709580274797666e-05, "loss": 0.8418, "step": 12340 }, { "epoch": 2.324487107095803, "grad_norm": 22.4197940826416, "learning_rate": 1.070205157161679e-05, "loss": 1.1285, "step": 12350 }, { "epoch": 2.326369282891022, "grad_norm": 20.703449249267578, "learning_rate": 1.0694522868435912e-05, "loss": 0.841, "step": 12360 }, { "epoch": 2.3282514586862413, "grad_norm": 27.92598533630371, "learning_rate": 1.0686994165255035e-05, "loss": 0.572, "step": 12370 }, { "epoch": 2.3301336344814607, "grad_norm": 1.4635416269302368, "learning_rate": 1.067946546207416e-05, "loss": 0.7061, "step": 12380 }, { "epoch": 2.3320158102766797, "grad_norm": 12.686742782592773, "learning_rate": 1.0671936758893283e-05, "loss": 0.7331, "step": 12390 }, { "epoch": 2.333897986071899, "grad_norm": 4.224462509155273, "learning_rate": 1.0664408055712406e-05, "loss": 1.2657, "step": 12400 }, { "epoch": 2.3357801618671186, "grad_norm": 13.47118091583252, "learning_rate": 1.0656879352531528e-05, "loss": 0.8943, "step": 12410 }, { "epoch": 2.3376623376623376, "grad_norm": 17.81936264038086, "learning_rate": 1.064935064935065e-05, "loss": 0.7193, "step": 12420 }, { "epoch": 2.339544513457557, "grad_norm": 8.82054328918457, "learning_rate": 1.0641821946169773e-05, "loss": 0.9461, "step": 12430 }, { "epoch": 2.341426689252776, "grad_norm": 26.209758758544922, "learning_rate": 1.0634293242988896e-05, "loss": 0.8565, "step": 12440 }, { "epoch": 2.3433088650479954, "grad_norm": 13.844365119934082, "learning_rate": 1.0626764539808018e-05, "loss": 0.9119, "step": 12450 }, { "epoch": 2.345191040843215, "grad_norm": 0.3456774353981018, "learning_rate": 1.0619235836627141e-05, "loss": 1.1152, "step": 12460 }, { "epoch": 2.3470732166384343, "grad_norm": 1.4162580966949463, "learning_rate": 1.0611707133446264e-05, "loss": 0.6846, "step": 12470 }, { "epoch": 2.3489553924336533, "grad_norm": 9.784071922302246, "learning_rate": 1.0604178430265389e-05, "loss": 1.1308, "step": 12480 }, { "epoch": 2.3508375682288727, "grad_norm": 65.54403686523438, "learning_rate": 1.0596649727084512e-05, "loss": 0.6059, "step": 12490 }, { "epoch": 2.3527197440240917, "grad_norm": 18.23605728149414, "learning_rate": 1.0589121023903633e-05, "loss": 0.5438, "step": 12500 }, { "epoch": 2.354601919819311, "grad_norm": 19.891637802124023, "learning_rate": 1.0581592320722756e-05, "loss": 0.9351, "step": 12510 }, { "epoch": 2.3564840956145305, "grad_norm": 20.713233947753906, "learning_rate": 1.0574063617541879e-05, "loss": 0.6646, "step": 12520 }, { "epoch": 2.3583662714097495, "grad_norm": 14.681770324707031, "learning_rate": 1.0566534914361002e-05, "loss": 0.8591, "step": 12530 }, { "epoch": 2.360248447204969, "grad_norm": 6.22154426574707, "learning_rate": 1.0559006211180125e-05, "loss": 0.8724, "step": 12540 }, { "epoch": 2.3621306230001884, "grad_norm": 28.546422958374023, "learning_rate": 1.0551477507999248e-05, "loss": 0.8389, "step": 12550 }, { "epoch": 2.3640127987954074, "grad_norm": 8.232221603393555, "learning_rate": 1.054394880481837e-05, "loss": 0.7999, "step": 12560 }, { "epoch": 2.365894974590627, "grad_norm": 25.91827964782715, "learning_rate": 1.0536420101637495e-05, "loss": 0.5889, "step": 12570 }, { "epoch": 2.3677771503858462, "grad_norm": 7.587324142456055, "learning_rate": 1.0528891398456618e-05, "loss": 0.6031, "step": 12580 }, { "epoch": 2.3696593261810652, "grad_norm": 16.203954696655273, "learning_rate": 1.052136269527574e-05, "loss": 0.7866, "step": 12590 }, { "epoch": 2.3715415019762847, "grad_norm": 16.91657066345215, "learning_rate": 1.0513833992094862e-05, "loss": 0.9203, "step": 12600 }, { "epoch": 2.3734236777715036, "grad_norm": 12.624746322631836, "learning_rate": 1.0506305288913985e-05, "loss": 0.9539, "step": 12610 }, { "epoch": 2.375305853566723, "grad_norm": 12.346266746520996, "learning_rate": 1.0498776585733108e-05, "loss": 0.5703, "step": 12620 }, { "epoch": 2.3771880293619425, "grad_norm": 12.931204795837402, "learning_rate": 1.0491247882552231e-05, "loss": 0.656, "step": 12630 }, { "epoch": 2.3790702051571615, "grad_norm": 24.599132537841797, "learning_rate": 1.0483719179371354e-05, "loss": 1.234, "step": 12640 }, { "epoch": 2.380952380952381, "grad_norm": 12.25839614868164, "learning_rate": 1.0476190476190477e-05, "loss": 0.6236, "step": 12650 }, { "epoch": 2.3828345567476004, "grad_norm": 16.587228775024414, "learning_rate": 1.0468661773009602e-05, "loss": 0.6196, "step": 12660 }, { "epoch": 2.3847167325428194, "grad_norm": 8.345686912536621, "learning_rate": 1.0461133069828724e-05, "loss": 0.7598, "step": 12670 }, { "epoch": 2.386598908338039, "grad_norm": 12.944352149963379, "learning_rate": 1.0453604366647846e-05, "loss": 1.0841, "step": 12680 }, { "epoch": 2.388481084133258, "grad_norm": 26.68334197998047, "learning_rate": 1.0446075663466969e-05, "loss": 1.0751, "step": 12690 }, { "epoch": 2.390363259928477, "grad_norm": 1.97270929813385, "learning_rate": 1.0438546960286091e-05, "loss": 0.6852, "step": 12700 }, { "epoch": 2.3922454357236966, "grad_norm": 10.04570198059082, "learning_rate": 1.0431018257105214e-05, "loss": 0.6548, "step": 12710 }, { "epoch": 2.3941276115189156, "grad_norm": 17.121076583862305, "learning_rate": 1.0423489553924337e-05, "loss": 0.7934, "step": 12720 }, { "epoch": 2.396009787314135, "grad_norm": 21.22518539428711, "learning_rate": 1.041596085074346e-05, "loss": 1.0878, "step": 12730 }, { "epoch": 2.3978919631093545, "grad_norm": 14.835243225097656, "learning_rate": 1.0408432147562583e-05, "loss": 0.6626, "step": 12740 }, { "epoch": 2.399774138904574, "grad_norm": 24.89073371887207, "learning_rate": 1.0400903444381708e-05, "loss": 0.9652, "step": 12750 }, { "epoch": 2.401656314699793, "grad_norm": 13.480345726013184, "learning_rate": 1.0393374741200827e-05, "loss": 0.758, "step": 12760 }, { "epoch": 2.4035384904950123, "grad_norm": 26.335533142089844, "learning_rate": 1.038584603801995e-05, "loss": 0.9258, "step": 12770 }, { "epoch": 2.4054206662902313, "grad_norm": 17.967731475830078, "learning_rate": 1.0378317334839075e-05, "loss": 1.1077, "step": 12780 }, { "epoch": 2.4073028420854508, "grad_norm": 10.612001419067383, "learning_rate": 1.0370788631658198e-05, "loss": 0.7788, "step": 12790 }, { "epoch": 2.40918501788067, "grad_norm": 25.963218688964844, "learning_rate": 1.036325992847732e-05, "loss": 1.0074, "step": 12800 }, { "epoch": 2.411067193675889, "grad_norm": 9.171457290649414, "learning_rate": 1.0355731225296444e-05, "loss": 0.8116, "step": 12810 }, { "epoch": 2.4129493694711086, "grad_norm": 5.5040459632873535, "learning_rate": 1.0348202522115567e-05, "loss": 0.7237, "step": 12820 }, { "epoch": 2.414831545266328, "grad_norm": 12.755102157592773, "learning_rate": 1.034067381893469e-05, "loss": 0.7111, "step": 12830 }, { "epoch": 2.416713721061547, "grad_norm": 12.672661781311035, "learning_rate": 1.0333145115753812e-05, "loss": 0.5718, "step": 12840 }, { "epoch": 2.4185958968567665, "grad_norm": 14.201343536376953, "learning_rate": 1.0325616412572934e-05, "loss": 0.8472, "step": 12850 }, { "epoch": 2.420478072651986, "grad_norm": 11.615950584411621, "learning_rate": 1.0318087709392057e-05, "loss": 0.6249, "step": 12860 }, { "epoch": 2.422360248447205, "grad_norm": 33.66123962402344, "learning_rate": 1.0310559006211181e-05, "loss": 0.784, "step": 12870 }, { "epoch": 2.4242424242424243, "grad_norm": 20.7972354888916, "learning_rate": 1.0303030303030304e-05, "loss": 0.8194, "step": 12880 }, { "epoch": 2.4261246000376433, "grad_norm": 14.012480735778809, "learning_rate": 1.0295501599849427e-05, "loss": 1.0207, "step": 12890 }, { "epoch": 2.4280067758328627, "grad_norm": 28.325319290161133, "learning_rate": 1.028797289666855e-05, "loss": 0.8505, "step": 12900 }, { "epoch": 2.429888951628082, "grad_norm": 12.772529602050781, "learning_rate": 1.0280444193487673e-05, "loss": 0.7771, "step": 12910 }, { "epoch": 2.431771127423301, "grad_norm": 15.617521286010742, "learning_rate": 1.0272915490306796e-05, "loss": 0.6914, "step": 12920 }, { "epoch": 2.4336533032185206, "grad_norm": 9.929617881774902, "learning_rate": 1.0265386787125919e-05, "loss": 0.8166, "step": 12930 }, { "epoch": 2.43553547901374, "grad_norm": 9.618364334106445, "learning_rate": 1.025785808394504e-05, "loss": 0.7019, "step": 12940 }, { "epoch": 2.437417654808959, "grad_norm": 22.540725708007812, "learning_rate": 1.0250329380764163e-05, "loss": 0.8511, "step": 12950 }, { "epoch": 2.4392998306041784, "grad_norm": 7.289876461029053, "learning_rate": 1.0242800677583287e-05, "loss": 0.6807, "step": 12960 }, { "epoch": 2.441182006399398, "grad_norm": 18.872398376464844, "learning_rate": 1.023527197440241e-05, "loss": 0.9898, "step": 12970 }, { "epoch": 2.443064182194617, "grad_norm": 14.431037902832031, "learning_rate": 1.0227743271221533e-05, "loss": 0.961, "step": 12980 }, { "epoch": 2.4449463579898363, "grad_norm": 19.559343338012695, "learning_rate": 1.0220214568040656e-05, "loss": 0.7496, "step": 12990 }, { "epoch": 2.4468285337850557, "grad_norm": 5.90576171875, "learning_rate": 1.0212685864859779e-05, "loss": 0.7529, "step": 13000 }, { "epoch": 2.4487107095802747, "grad_norm": 18.241514205932617, "learning_rate": 1.0205157161678902e-05, "loss": 0.6666, "step": 13010 }, { "epoch": 2.450592885375494, "grad_norm": 30.32737159729004, "learning_rate": 1.0197628458498025e-05, "loss": 0.7313, "step": 13020 }, { "epoch": 2.4524750611707136, "grad_norm": 8.771208763122559, "learning_rate": 1.0190099755317146e-05, "loss": 0.4378, "step": 13030 }, { "epoch": 2.4543572369659326, "grad_norm": 21.53113555908203, "learning_rate": 1.0182571052136269e-05, "loss": 0.6384, "step": 13040 }, { "epoch": 2.456239412761152, "grad_norm": 14.424870491027832, "learning_rate": 1.0175042348955392e-05, "loss": 0.5578, "step": 13050 }, { "epoch": 2.458121588556371, "grad_norm": 21.395099639892578, "learning_rate": 1.0167513645774517e-05, "loss": 0.5507, "step": 13060 }, { "epoch": 2.4600037643515904, "grad_norm": 18.84751319885254, "learning_rate": 1.015998494259364e-05, "loss": 0.9829, "step": 13070 }, { "epoch": 2.46188594014681, "grad_norm": 7.269599437713623, "learning_rate": 1.0152456239412763e-05, "loss": 0.8354, "step": 13080 }, { "epoch": 2.463768115942029, "grad_norm": 14.933770179748535, "learning_rate": 1.0144927536231885e-05, "loss": 0.7661, "step": 13090 }, { "epoch": 2.4656502917372483, "grad_norm": 0.9064294695854187, "learning_rate": 1.0137398833051008e-05, "loss": 0.9594, "step": 13100 }, { "epoch": 2.4675324675324677, "grad_norm": 14.727800369262695, "learning_rate": 1.012987012987013e-05, "loss": 0.5136, "step": 13110 }, { "epoch": 2.4694146433276867, "grad_norm": 20.03411293029785, "learning_rate": 1.0122341426689252e-05, "loss": 1.0009, "step": 13120 }, { "epoch": 2.471296819122906, "grad_norm": 19.584400177001953, "learning_rate": 1.0114812723508375e-05, "loss": 0.8759, "step": 13130 }, { "epoch": 2.4731789949181255, "grad_norm": 9.938185691833496, "learning_rate": 1.0107284020327498e-05, "loss": 0.7959, "step": 13140 }, { "epoch": 2.4750611707133445, "grad_norm": 23.411479949951172, "learning_rate": 1.0099755317146623e-05, "loss": 0.7344, "step": 13150 }, { "epoch": 2.476943346508564, "grad_norm": 12.910082817077637, "learning_rate": 1.0092226613965746e-05, "loss": 0.6742, "step": 13160 }, { "epoch": 2.478825522303783, "grad_norm": 19.10327911376953, "learning_rate": 1.0084697910784869e-05, "loss": 0.8607, "step": 13170 }, { "epoch": 2.4807076980990024, "grad_norm": 8.324233055114746, "learning_rate": 1.0077169207603992e-05, "loss": 0.874, "step": 13180 }, { "epoch": 2.482589873894222, "grad_norm": 44.7740592956543, "learning_rate": 1.0069640504423115e-05, "loss": 0.6132, "step": 13190 }, { "epoch": 2.4844720496894412, "grad_norm": 24.905864715576172, "learning_rate": 1.0062111801242236e-05, "loss": 0.7348, "step": 13200 }, { "epoch": 2.4863542254846602, "grad_norm": 39.80661392211914, "learning_rate": 1.0054583098061359e-05, "loss": 0.7712, "step": 13210 }, { "epoch": 2.4882364012798797, "grad_norm": 14.922135353088379, "learning_rate": 1.0047054394880482e-05, "loss": 0.7724, "step": 13220 }, { "epoch": 2.4901185770750986, "grad_norm": 8.277451515197754, "learning_rate": 1.0039525691699605e-05, "loss": 0.8076, "step": 13230 }, { "epoch": 2.492000752870318, "grad_norm": 12.632638931274414, "learning_rate": 1.003199698851873e-05, "loss": 0.8422, "step": 13240 }, { "epoch": 2.4938829286655375, "grad_norm": 10.459776878356934, "learning_rate": 1.0024468285337852e-05, "loss": 0.7523, "step": 13250 }, { "epoch": 2.4957651044607565, "grad_norm": 7.631857395172119, "learning_rate": 1.0016939582156975e-05, "loss": 0.7041, "step": 13260 }, { "epoch": 2.497647280255976, "grad_norm": 25.433801651000977, "learning_rate": 1.0009410878976098e-05, "loss": 0.8024, "step": 13270 }, { "epoch": 2.4995294560511954, "grad_norm": 2.3861024379730225, "learning_rate": 1.0001882175795221e-05, "loss": 0.7084, "step": 13280 }, { "epoch": 2.5014116318464144, "grad_norm": 32.96226119995117, "learning_rate": 9.994353472614344e-06, "loss": 0.8429, "step": 13290 }, { "epoch": 2.503293807641634, "grad_norm": 22.255521774291992, "learning_rate": 9.986824769433467e-06, "loss": 0.5662, "step": 13300 }, { "epoch": 2.505175983436853, "grad_norm": 17.061006546020508, "learning_rate": 9.979296066252588e-06, "loss": 0.8483, "step": 13310 }, { "epoch": 2.507058159232072, "grad_norm": 16.66559410095215, "learning_rate": 9.971767363071711e-06, "loss": 0.6624, "step": 13320 }, { "epoch": 2.5089403350272916, "grad_norm": 16.982269287109375, "learning_rate": 9.964238659890836e-06, "loss": 0.9074, "step": 13330 }, { "epoch": 2.5108225108225106, "grad_norm": 8.781649589538574, "learning_rate": 9.956709956709958e-06, "loss": 0.9302, "step": 13340 }, { "epoch": 2.51270468661773, "grad_norm": 9.977226257324219, "learning_rate": 9.94918125352908e-06, "loss": 0.7217, "step": 13350 }, { "epoch": 2.5145868624129495, "grad_norm": 20.91251564025879, "learning_rate": 9.941652550348203e-06, "loss": 0.7181, "step": 13360 }, { "epoch": 2.516469038208169, "grad_norm": 15.54866886138916, "learning_rate": 9.934123847167326e-06, "loss": 0.4848, "step": 13370 }, { "epoch": 2.518351214003388, "grad_norm": 11.833428382873535, "learning_rate": 9.92659514398645e-06, "loss": 0.4919, "step": 13380 }, { "epoch": 2.5202333897986073, "grad_norm": 7.332128524780273, "learning_rate": 9.919066440805573e-06, "loss": 0.7247, "step": 13390 }, { "epoch": 2.5221155655938263, "grad_norm": 13.253988265991211, "learning_rate": 9.911537737624694e-06, "loss": 0.7385, "step": 13400 }, { "epoch": 2.5239977413890458, "grad_norm": 24.234617233276367, "learning_rate": 9.904009034443817e-06, "loss": 0.7513, "step": 13410 }, { "epoch": 2.525879917184265, "grad_norm": 24.290355682373047, "learning_rate": 9.89648033126294e-06, "loss": 1.0585, "step": 13420 }, { "epoch": 2.527762092979484, "grad_norm": 7.267026901245117, "learning_rate": 9.888951628082065e-06, "loss": 0.574, "step": 13430 }, { "epoch": 2.5296442687747036, "grad_norm": 5.2897443771362305, "learning_rate": 9.881422924901186e-06, "loss": 0.6149, "step": 13440 }, { "epoch": 2.5315264445699226, "grad_norm": 27.750844955444336, "learning_rate": 9.873894221720309e-06, "loss": 0.9253, "step": 13450 }, { "epoch": 2.533408620365142, "grad_norm": 12.20700454711914, "learning_rate": 9.866365518539432e-06, "loss": 0.7781, "step": 13460 }, { "epoch": 2.5352907961603615, "grad_norm": 13.96152400970459, "learning_rate": 9.858836815358556e-06, "loss": 0.8337, "step": 13470 }, { "epoch": 2.537172971955581, "grad_norm": 12.576264381408691, "learning_rate": 9.851308112177678e-06, "loss": 0.6645, "step": 13480 }, { "epoch": 2.5390551477508, "grad_norm": 25.116222381591797, "learning_rate": 9.8437794089968e-06, "loss": 1.2199, "step": 13490 }, { "epoch": 2.5409373235460193, "grad_norm": 18.44822120666504, "learning_rate": 9.836250705815924e-06, "loss": 0.5354, "step": 13500 }, { "epoch": 2.5428194993412383, "grad_norm": 12.855209350585938, "learning_rate": 9.828722002635046e-06, "loss": 0.7728, "step": 13510 }, { "epoch": 2.5447016751364577, "grad_norm": 25.006006240844727, "learning_rate": 9.821193299454171e-06, "loss": 0.8413, "step": 13520 }, { "epoch": 2.546583850931677, "grad_norm": 27.276927947998047, "learning_rate": 9.813664596273292e-06, "loss": 1.0349, "step": 13530 }, { "epoch": 2.548466026726896, "grad_norm": 13.0643949508667, "learning_rate": 9.806135893092415e-06, "loss": 0.7487, "step": 13540 }, { "epoch": 2.5503482025221156, "grad_norm": 15.080717086791992, "learning_rate": 9.798607189911538e-06, "loss": 0.9849, "step": 13550 }, { "epoch": 2.5522303783173346, "grad_norm": 7.237006664276123, "learning_rate": 9.791078486730661e-06, "loss": 0.8056, "step": 13560 }, { "epoch": 2.554112554112554, "grad_norm": 12.461982727050781, "learning_rate": 9.783549783549784e-06, "loss": 0.8994, "step": 13570 }, { "epoch": 2.5559947299077734, "grad_norm": 13.99405288696289, "learning_rate": 9.776021080368907e-06, "loss": 0.7762, "step": 13580 }, { "epoch": 2.557876905702993, "grad_norm": 12.452048301696777, "learning_rate": 9.76849237718803e-06, "loss": 1.1317, "step": 13590 }, { "epoch": 2.559759081498212, "grad_norm": 18.15245819091797, "learning_rate": 9.760963674007153e-06, "loss": 0.6629, "step": 13600 }, { "epoch": 2.5616412572934313, "grad_norm": 20.07953453063965, "learning_rate": 9.753434970826276e-06, "loss": 0.6838, "step": 13610 }, { "epoch": 2.5635234330886503, "grad_norm": 34.626930236816406, "learning_rate": 9.745906267645399e-06, "loss": 0.634, "step": 13620 }, { "epoch": 2.5654056088838697, "grad_norm": 11.463188171386719, "learning_rate": 9.738377564464521e-06, "loss": 1.0214, "step": 13630 }, { "epoch": 2.567287784679089, "grad_norm": 9.072608947753906, "learning_rate": 9.730848861283644e-06, "loss": 0.7319, "step": 13640 }, { "epoch": 2.5691699604743086, "grad_norm": 22.94637107849121, "learning_rate": 9.723320158102767e-06, "loss": 0.68, "step": 13650 }, { "epoch": 2.5710521362695276, "grad_norm": 5.962002754211426, "learning_rate": 9.71579145492189e-06, "loss": 0.8871, "step": 13660 }, { "epoch": 2.572934312064747, "grad_norm": 16.218671798706055, "learning_rate": 9.708262751741013e-06, "loss": 0.88, "step": 13670 }, { "epoch": 2.574816487859966, "grad_norm": 7.1879777908325195, "learning_rate": 9.700734048560136e-06, "loss": 1.2115, "step": 13680 }, { "epoch": 2.5766986636551854, "grad_norm": 7.769774913787842, "learning_rate": 9.693205345379259e-06, "loss": 0.4464, "step": 13690 }, { "epoch": 2.578580839450405, "grad_norm": 31.54520606994629, "learning_rate": 9.685676642198382e-06, "loss": 1.1748, "step": 13700 }, { "epoch": 2.580463015245624, "grad_norm": 36.15426254272461, "learning_rate": 9.678147939017505e-06, "loss": 0.8529, "step": 13710 }, { "epoch": 2.5823451910408433, "grad_norm": 19.819948196411133, "learning_rate": 9.670619235836628e-06, "loss": 0.658, "step": 13720 }, { "epoch": 2.5842273668360622, "grad_norm": 38.99443435668945, "learning_rate": 9.66309053265575e-06, "loss": 1.0109, "step": 13730 }, { "epoch": 2.5861095426312817, "grad_norm": 38.460819244384766, "learning_rate": 9.655561829474874e-06, "loss": 0.9315, "step": 13740 }, { "epoch": 2.587991718426501, "grad_norm": 30.626075744628906, "learning_rate": 9.648033126293997e-06, "loss": 0.9099, "step": 13750 }, { "epoch": 2.5898738942217205, "grad_norm": 2.2698826789855957, "learning_rate": 9.64050442311312e-06, "loss": 0.357, "step": 13760 }, { "epoch": 2.5917560700169395, "grad_norm": 13.714137077331543, "learning_rate": 9.632975719932242e-06, "loss": 0.8274, "step": 13770 }, { "epoch": 2.593638245812159, "grad_norm": 13.168296813964844, "learning_rate": 9.625447016751365e-06, "loss": 0.6799, "step": 13780 }, { "epoch": 2.595520421607378, "grad_norm": 22.481964111328125, "learning_rate": 9.617918313570488e-06, "loss": 0.8352, "step": 13790 }, { "epoch": 2.5974025974025974, "grad_norm": 2.651874303817749, "learning_rate": 9.610389610389611e-06, "loss": 0.5312, "step": 13800 }, { "epoch": 2.599284773197817, "grad_norm": 24.131620407104492, "learning_rate": 9.602860907208734e-06, "loss": 1.0113, "step": 13810 }, { "epoch": 2.601166948993036, "grad_norm": 28.216411590576172, "learning_rate": 9.595332204027857e-06, "loss": 0.9916, "step": 13820 }, { "epoch": 2.6030491247882552, "grad_norm": 33.7442626953125, "learning_rate": 9.58780350084698e-06, "loss": 1.0541, "step": 13830 }, { "epoch": 2.6049313005834747, "grad_norm": 26.970806121826172, "learning_rate": 9.580274797666103e-06, "loss": 0.5681, "step": 13840 }, { "epoch": 2.6068134763786937, "grad_norm": 4.043606758117676, "learning_rate": 9.572746094485226e-06, "loss": 0.5892, "step": 13850 }, { "epoch": 2.608695652173913, "grad_norm": 1.2837311029434204, "learning_rate": 9.565217391304349e-06, "loss": 0.4397, "step": 13860 }, { "epoch": 2.6105778279691325, "grad_norm": 12.38525104522705, "learning_rate": 9.557688688123472e-06, "loss": 0.6613, "step": 13870 }, { "epoch": 2.6124600037643515, "grad_norm": 23.795654296875, "learning_rate": 9.550159984942595e-06, "loss": 0.6335, "step": 13880 }, { "epoch": 2.614342179559571, "grad_norm": 28.89879035949707, "learning_rate": 9.542631281761717e-06, "loss": 0.7167, "step": 13890 }, { "epoch": 2.61622435535479, "grad_norm": 8.0195894241333, "learning_rate": 9.53510257858084e-06, "loss": 1.1297, "step": 13900 }, { "epoch": 2.6181065311500094, "grad_norm": 10.090495109558105, "learning_rate": 9.527573875399963e-06, "loss": 0.858, "step": 13910 }, { "epoch": 2.619988706945229, "grad_norm": 24.793590545654297, "learning_rate": 9.520045172219086e-06, "loss": 0.8893, "step": 13920 }, { "epoch": 2.621870882740448, "grad_norm": 3.7464537620544434, "learning_rate": 9.512516469038209e-06, "loss": 0.5312, "step": 13930 }, { "epoch": 2.623753058535667, "grad_norm": 11.590424537658691, "learning_rate": 9.504987765857332e-06, "loss": 0.7581, "step": 13940 }, { "epoch": 2.6256352343308866, "grad_norm": 23.143346786499023, "learning_rate": 9.497459062676455e-06, "loss": 0.581, "step": 13950 }, { "epoch": 2.6275174101261056, "grad_norm": 5.5682477951049805, "learning_rate": 9.489930359495578e-06, "loss": 0.9783, "step": 13960 }, { "epoch": 2.629399585921325, "grad_norm": 14.035380363464355, "learning_rate": 9.4824016563147e-06, "loss": 0.5775, "step": 13970 }, { "epoch": 2.6312817617165445, "grad_norm": 28.95306968688965, "learning_rate": 9.474872953133824e-06, "loss": 0.5147, "step": 13980 }, { "epoch": 2.6331639375117635, "grad_norm": 14.128450393676758, "learning_rate": 9.467344249952947e-06, "loss": 1.1285, "step": 13990 }, { "epoch": 2.635046113306983, "grad_norm": 23.115079879760742, "learning_rate": 9.45981554677207e-06, "loss": 0.9068, "step": 14000 }, { "epoch": 2.636928289102202, "grad_norm": 4.194851875305176, "learning_rate": 9.452286843591193e-06, "loss": 0.7157, "step": 14010 }, { "epoch": 2.6388104648974213, "grad_norm": 6.765182018280029, "learning_rate": 9.444758140410315e-06, "loss": 0.6346, "step": 14020 }, { "epoch": 2.6406926406926408, "grad_norm": 5.432229518890381, "learning_rate": 9.437229437229438e-06, "loss": 0.5878, "step": 14030 }, { "epoch": 2.64257481648786, "grad_norm": 27.054845809936523, "learning_rate": 9.429700734048561e-06, "loss": 0.517, "step": 14040 }, { "epoch": 2.644456992283079, "grad_norm": 9.652582168579102, "learning_rate": 9.422172030867684e-06, "loss": 0.6961, "step": 14050 }, { "epoch": 2.6463391680782986, "grad_norm": 2.1461129188537598, "learning_rate": 9.414643327686807e-06, "loss": 0.7967, "step": 14060 }, { "epoch": 2.6482213438735176, "grad_norm": 61.811004638671875, "learning_rate": 9.40711462450593e-06, "loss": 0.4514, "step": 14070 }, { "epoch": 2.650103519668737, "grad_norm": 5.977787017822266, "learning_rate": 9.399585921325053e-06, "loss": 0.6916, "step": 14080 }, { "epoch": 2.6519856954639565, "grad_norm": 1.1817024946212769, "learning_rate": 9.392057218144174e-06, "loss": 0.4419, "step": 14090 }, { "epoch": 2.6538678712591754, "grad_norm": 1.6775157451629639, "learning_rate": 9.384528514963299e-06, "loss": 0.7471, "step": 14100 }, { "epoch": 2.655750047054395, "grad_norm": 10.702045440673828, "learning_rate": 9.376999811782422e-06, "loss": 0.727, "step": 14110 }, { "epoch": 2.6576322228496143, "grad_norm": 26.477373123168945, "learning_rate": 9.369471108601545e-06, "loss": 1.0107, "step": 14120 }, { "epoch": 2.6595143986448333, "grad_norm": 7.560026168823242, "learning_rate": 9.361942405420668e-06, "loss": 0.7974, "step": 14130 }, { "epoch": 2.6613965744400527, "grad_norm": 29.21369743347168, "learning_rate": 9.354413702239789e-06, "loss": 0.8084, "step": 14140 }, { "epoch": 2.663278750235272, "grad_norm": 19.456628799438477, "learning_rate": 9.346884999058913e-06, "loss": 0.9577, "step": 14150 }, { "epoch": 2.665160926030491, "grad_norm": 18.852127075195312, "learning_rate": 9.339356295878036e-06, "loss": 0.6632, "step": 14160 }, { "epoch": 2.6670431018257106, "grad_norm": 2.706190824508667, "learning_rate": 9.33182759269716e-06, "loss": 0.5317, "step": 14170 }, { "epoch": 2.6689252776209296, "grad_norm": 11.295425415039062, "learning_rate": 9.32429888951628e-06, "loss": 1.0268, "step": 14180 }, { "epoch": 2.670807453416149, "grad_norm": 16.14836311340332, "learning_rate": 9.316770186335405e-06, "loss": 0.6663, "step": 14190 }, { "epoch": 2.6726896292113684, "grad_norm": 11.459263801574707, "learning_rate": 9.309241483154528e-06, "loss": 0.7522, "step": 14200 }, { "epoch": 2.674571805006588, "grad_norm": 15.684527397155762, "learning_rate": 9.301712779973651e-06, "loss": 1.1521, "step": 14210 }, { "epoch": 2.676453980801807, "grad_norm": 5.818396091461182, "learning_rate": 9.294184076792772e-06, "loss": 0.8579, "step": 14220 }, { "epoch": 2.6783361565970263, "grad_norm": 4.9605326652526855, "learning_rate": 9.286655373611895e-06, "loss": 0.4771, "step": 14230 }, { "epoch": 2.6802183323922453, "grad_norm": 3.3847286701202393, "learning_rate": 9.27912667043102e-06, "loss": 0.5427, "step": 14240 }, { "epoch": 2.6821005081874647, "grad_norm": 8.891020774841309, "learning_rate": 9.271597967250143e-06, "loss": 0.6936, "step": 14250 }, { "epoch": 2.683982683982684, "grad_norm": 17.484031677246094, "learning_rate": 9.264069264069266e-06, "loss": 0.9668, "step": 14260 }, { "epoch": 2.685864859777903, "grad_norm": 11.090484619140625, "learning_rate": 9.256540560888387e-06, "loss": 1.2104, "step": 14270 }, { "epoch": 2.6877470355731226, "grad_norm": 17.802000045776367, "learning_rate": 9.24901185770751e-06, "loss": 0.8714, "step": 14280 }, { "epoch": 2.6896292113683415, "grad_norm": 12.713444709777832, "learning_rate": 9.241483154526634e-06, "loss": 0.6936, "step": 14290 }, { "epoch": 2.691511387163561, "grad_norm": 15.754109382629395, "learning_rate": 9.233954451345757e-06, "loss": 0.8509, "step": 14300 }, { "epoch": 2.6933935629587804, "grad_norm": 25.97137451171875, "learning_rate": 9.226425748164878e-06, "loss": 0.6812, "step": 14310 }, { "epoch": 2.695275738754, "grad_norm": 5.161522388458252, "learning_rate": 9.218897044984001e-06, "loss": 0.7603, "step": 14320 }, { "epoch": 2.697157914549219, "grad_norm": 9.959467887878418, "learning_rate": 9.211368341803126e-06, "loss": 0.4748, "step": 14330 }, { "epoch": 2.6990400903444383, "grad_norm": 5.075318336486816, "learning_rate": 9.203839638622249e-06, "loss": 0.7132, "step": 14340 }, { "epoch": 2.7009222661396572, "grad_norm": 47.152984619140625, "learning_rate": 9.19631093544137e-06, "loss": 0.9679, "step": 14350 }, { "epoch": 2.7028044419348767, "grad_norm": 5.405478000640869, "learning_rate": 9.188782232260493e-06, "loss": 0.6459, "step": 14360 }, { "epoch": 2.704686617730096, "grad_norm": 8.182209014892578, "learning_rate": 9.181253529079616e-06, "loss": 0.7378, "step": 14370 }, { "epoch": 2.7065687935253155, "grad_norm": 16.320960998535156, "learning_rate": 9.17372482589874e-06, "loss": 0.9392, "step": 14380 }, { "epoch": 2.7084509693205345, "grad_norm": 26.094823837280273, "learning_rate": 9.166196122717864e-06, "loss": 0.5237, "step": 14390 }, { "epoch": 2.710333145115754, "grad_norm": 1.0742714405059814, "learning_rate": 9.158667419536985e-06, "loss": 0.7379, "step": 14400 }, { "epoch": 2.712215320910973, "grad_norm": 19.00723648071289, "learning_rate": 9.151138716356108e-06, "loss": 0.7913, "step": 14410 }, { "epoch": 2.7140974967061924, "grad_norm": 14.363834381103516, "learning_rate": 9.143610013175232e-06, "loss": 0.9245, "step": 14420 }, { "epoch": 2.715979672501412, "grad_norm": 20.376310348510742, "learning_rate": 9.136081309994355e-06, "loss": 0.749, "step": 14430 }, { "epoch": 2.717861848296631, "grad_norm": 0.6434114575386047, "learning_rate": 9.128552606813476e-06, "loss": 0.7697, "step": 14440 }, { "epoch": 2.7197440240918502, "grad_norm": 12.685359001159668, "learning_rate": 9.1210239036326e-06, "loss": 0.6835, "step": 14450 }, { "epoch": 2.721626199887069, "grad_norm": 2.339576005935669, "learning_rate": 9.113495200451722e-06, "loss": 0.7144, "step": 14460 }, { "epoch": 2.7235083756822887, "grad_norm": 10.208086013793945, "learning_rate": 9.105966497270847e-06, "loss": 0.7922, "step": 14470 }, { "epoch": 2.725390551477508, "grad_norm": 12.116181373596191, "learning_rate": 9.09843779408997e-06, "loss": 0.5502, "step": 14480 }, { "epoch": 2.7272727272727275, "grad_norm": 18.400135040283203, "learning_rate": 9.090909090909091e-06, "loss": 0.9157, "step": 14490 }, { "epoch": 2.7291549030679465, "grad_norm": 20.76301383972168, "learning_rate": 9.083380387728214e-06, "loss": 0.8151, "step": 14500 }, { "epoch": 2.731037078863166, "grad_norm": 4.120022296905518, "learning_rate": 9.075851684547337e-06, "loss": 0.6814, "step": 14510 }, { "epoch": 2.732919254658385, "grad_norm": 10.123649597167969, "learning_rate": 9.068322981366461e-06, "loss": 0.7769, "step": 14520 }, { "epoch": 2.7348014304536044, "grad_norm": 16.534719467163086, "learning_rate": 9.060794278185583e-06, "loss": 0.8379, "step": 14530 }, { "epoch": 2.736683606248824, "grad_norm": 18.45408058166504, "learning_rate": 9.053265575004706e-06, "loss": 0.5487, "step": 14540 }, { "epoch": 2.7385657820440428, "grad_norm": 4.100691318511963, "learning_rate": 9.045736871823829e-06, "loss": 0.7074, "step": 14550 }, { "epoch": 2.740447957839262, "grad_norm": 27.86737632751465, "learning_rate": 9.038208168642953e-06, "loss": 0.9491, "step": 14560 }, { "epoch": 2.742330133634481, "grad_norm": 10.956998825073242, "learning_rate": 9.030679465462074e-06, "loss": 0.9141, "step": 14570 }, { "epoch": 2.7442123094297006, "grad_norm": 24.600666046142578, "learning_rate": 9.023150762281197e-06, "loss": 0.9744, "step": 14580 }, { "epoch": 2.74609448522492, "grad_norm": 5.019098281860352, "learning_rate": 9.01562205910032e-06, "loss": 0.7633, "step": 14590 }, { "epoch": 2.7479766610201395, "grad_norm": 15.814457893371582, "learning_rate": 9.008093355919443e-06, "loss": 0.6359, "step": 14600 }, { "epoch": 2.7498588368153585, "grad_norm": 14.39587116241455, "learning_rate": 9.000564652738568e-06, "loss": 0.7094, "step": 14610 }, { "epoch": 2.751741012610578, "grad_norm": 19.537870407104492, "learning_rate": 8.993035949557689e-06, "loss": 0.6722, "step": 14620 }, { "epoch": 2.753623188405797, "grad_norm": 22.65885353088379, "learning_rate": 8.985507246376812e-06, "loss": 0.8115, "step": 14630 }, { "epoch": 2.7555053642010163, "grad_norm": 3.483638048171997, "learning_rate": 8.977978543195935e-06, "loss": 0.5659, "step": 14640 }, { "epoch": 2.7573875399962358, "grad_norm": 20.589828491210938, "learning_rate": 8.970449840015058e-06, "loss": 0.8191, "step": 14650 }, { "epoch": 2.759269715791455, "grad_norm": 22.62665557861328, "learning_rate": 8.96292113683418e-06, "loss": 0.7916, "step": 14660 }, { "epoch": 2.761151891586674, "grad_norm": 21.46197509765625, "learning_rate": 8.955392433653304e-06, "loss": 0.8109, "step": 14670 }, { "epoch": 2.7630340673818936, "grad_norm": 32.4882698059082, "learning_rate": 8.947863730472427e-06, "loss": 1.0646, "step": 14680 }, { "epoch": 2.7649162431771126, "grad_norm": 7.668648719787598, "learning_rate": 8.94033502729155e-06, "loss": 0.6514, "step": 14690 }, { "epoch": 2.766798418972332, "grad_norm": 4.4062581062316895, "learning_rate": 8.932806324110672e-06, "loss": 0.7437, "step": 14700 }, { "epoch": 2.7686805947675515, "grad_norm": 10.707076072692871, "learning_rate": 8.925277620929795e-06, "loss": 0.7874, "step": 14710 }, { "epoch": 2.7705627705627704, "grad_norm": 10.452109336853027, "learning_rate": 8.917748917748918e-06, "loss": 0.5787, "step": 14720 }, { "epoch": 2.77244494635799, "grad_norm": 5.401888370513916, "learning_rate": 8.910220214568041e-06, "loss": 0.824, "step": 14730 }, { "epoch": 2.774327122153209, "grad_norm": 7.998766899108887, "learning_rate": 8.902691511387164e-06, "loss": 0.7369, "step": 14740 }, { "epoch": 2.7762092979484283, "grad_norm": 18.264177322387695, "learning_rate": 8.895162808206287e-06, "loss": 0.6624, "step": 14750 }, { "epoch": 2.7780914737436477, "grad_norm": 2.7713396549224854, "learning_rate": 8.88763410502541e-06, "loss": 0.7402, "step": 14760 }, { "epoch": 2.779973649538867, "grad_norm": 6.356698989868164, "learning_rate": 8.880105401844533e-06, "loss": 0.5583, "step": 14770 }, { "epoch": 2.781855825334086, "grad_norm": 15.435123443603516, "learning_rate": 8.872576698663656e-06, "loss": 0.8285, "step": 14780 }, { "epoch": 2.7837380011293056, "grad_norm": 18.149913787841797, "learning_rate": 8.865047995482779e-06, "loss": 1.1242, "step": 14790 }, { "epoch": 2.7856201769245246, "grad_norm": 37.68449783325195, "learning_rate": 8.857519292301902e-06, "loss": 0.7911, "step": 14800 }, { "epoch": 2.787502352719744, "grad_norm": 12.546880722045898, "learning_rate": 8.849990589121025e-06, "loss": 0.7478, "step": 14810 }, { "epoch": 2.7893845285149634, "grad_norm": 13.983786582946777, "learning_rate": 8.842461885940147e-06, "loss": 0.4683, "step": 14820 }, { "epoch": 2.7912667043101824, "grad_norm": 13.797372817993164, "learning_rate": 8.83493318275927e-06, "loss": 0.8398, "step": 14830 }, { "epoch": 2.793148880105402, "grad_norm": 14.855267524719238, "learning_rate": 8.827404479578393e-06, "loss": 0.9749, "step": 14840 }, { "epoch": 2.795031055900621, "grad_norm": 7.507328987121582, "learning_rate": 8.819875776397516e-06, "loss": 0.7643, "step": 14850 }, { "epoch": 2.7969132316958403, "grad_norm": 6.816436290740967, "learning_rate": 8.812347073216639e-06, "loss": 0.7515, "step": 14860 }, { "epoch": 2.7987954074910597, "grad_norm": 0.9153765439987183, "learning_rate": 8.804818370035762e-06, "loss": 0.4428, "step": 14870 }, { "epoch": 2.800677583286279, "grad_norm": 6.6985673904418945, "learning_rate": 8.797289666854885e-06, "loss": 0.8417, "step": 14880 }, { "epoch": 2.802559759081498, "grad_norm": 8.799405097961426, "learning_rate": 8.789760963674008e-06, "loss": 0.7036, "step": 14890 }, { "epoch": 2.8044419348767176, "grad_norm": 11.702898025512695, "learning_rate": 8.78223226049313e-06, "loss": 0.6547, "step": 14900 }, { "epoch": 2.8063241106719365, "grad_norm": 8.840179443359375, "learning_rate": 8.774703557312254e-06, "loss": 0.8857, "step": 14910 }, { "epoch": 2.808206286467156, "grad_norm": 17.277437210083008, "learning_rate": 8.767174854131377e-06, "loss": 0.9799, "step": 14920 }, { "epoch": 2.8100884622623754, "grad_norm": 40.69013977050781, "learning_rate": 8.7596461509505e-06, "loss": 0.8676, "step": 14930 }, { "epoch": 2.811970638057595, "grad_norm": 2.312723398208618, "learning_rate": 8.752117447769623e-06, "loss": 0.4275, "step": 14940 }, { "epoch": 2.813852813852814, "grad_norm": 30.11204719543457, "learning_rate": 8.744588744588745e-06, "loss": 0.8106, "step": 14950 }, { "epoch": 2.8157349896480333, "grad_norm": 10.502528190612793, "learning_rate": 8.737060041407868e-06, "loss": 0.8292, "step": 14960 }, { "epoch": 2.8176171654432522, "grad_norm": 20.59739875793457, "learning_rate": 8.729531338226991e-06, "loss": 0.5314, "step": 14970 }, { "epoch": 2.8194993412384717, "grad_norm": 8.57652473449707, "learning_rate": 8.722002635046114e-06, "loss": 0.7936, "step": 14980 }, { "epoch": 2.821381517033691, "grad_norm": 8.064692497253418, "learning_rate": 8.714473931865237e-06, "loss": 0.536, "step": 14990 }, { "epoch": 2.82326369282891, "grad_norm": 9.103423118591309, "learning_rate": 8.70694522868436e-06, "loss": 0.9126, "step": 15000 }, { "epoch": 2.8251458686241295, "grad_norm": 25.442306518554688, "learning_rate": 8.699416525503483e-06, "loss": 0.6525, "step": 15010 }, { "epoch": 2.8270280444193485, "grad_norm": 17.37743377685547, "learning_rate": 8.691887822322606e-06, "loss": 0.4816, "step": 15020 }, { "epoch": 2.828910220214568, "grad_norm": 22.00581932067871, "learning_rate": 8.684359119141729e-06, "loss": 0.6523, "step": 15030 }, { "epoch": 2.8307923960097874, "grad_norm": 13.935546875, "learning_rate": 8.676830415960852e-06, "loss": 0.5463, "step": 15040 }, { "epoch": 2.832674571805007, "grad_norm": 7.106715679168701, "learning_rate": 8.669301712779975e-06, "loss": 0.7296, "step": 15050 }, { "epoch": 2.834556747600226, "grad_norm": 8.322189331054688, "learning_rate": 8.661773009599098e-06, "loss": 0.7273, "step": 15060 }, { "epoch": 2.8364389233954452, "grad_norm": 15.4451322555542, "learning_rate": 8.65424430641822e-06, "loss": 0.8192, "step": 15070 }, { "epoch": 2.838321099190664, "grad_norm": 24.261255264282227, "learning_rate": 8.646715603237343e-06, "loss": 0.4825, "step": 15080 }, { "epoch": 2.8402032749858837, "grad_norm": 6.4923095703125, "learning_rate": 8.639186900056465e-06, "loss": 0.6175, "step": 15090 }, { "epoch": 2.842085450781103, "grad_norm": 7.1945085525512695, "learning_rate": 8.63165819687559e-06, "loss": 0.8712, "step": 15100 }, { "epoch": 2.843967626576322, "grad_norm": 9.958353996276855, "learning_rate": 8.624129493694712e-06, "loss": 0.8539, "step": 15110 }, { "epoch": 2.8458498023715415, "grad_norm": 4.765786647796631, "learning_rate": 8.616600790513835e-06, "loss": 0.6327, "step": 15120 }, { "epoch": 2.847731978166761, "grad_norm": 0.4407392740249634, "learning_rate": 8.609072087332958e-06, "loss": 0.6043, "step": 15130 }, { "epoch": 2.84961415396198, "grad_norm": 12.082877159118652, "learning_rate": 8.601543384152081e-06, "loss": 0.7255, "step": 15140 }, { "epoch": 2.8514963297571994, "grad_norm": 11.160933494567871, "learning_rate": 8.594014680971204e-06, "loss": 0.5107, "step": 15150 }, { "epoch": 2.853378505552419, "grad_norm": 7.909982681274414, "learning_rate": 8.586485977790327e-06, "loss": 0.7659, "step": 15160 }, { "epoch": 2.8552606813476378, "grad_norm": 15.593622207641602, "learning_rate": 8.57895727460945e-06, "loss": 0.6901, "step": 15170 }, { "epoch": 2.857142857142857, "grad_norm": 31.364673614501953, "learning_rate": 8.571428571428571e-06, "loss": 0.5985, "step": 15180 }, { "epoch": 2.859025032938076, "grad_norm": 26.913175582885742, "learning_rate": 8.563899868247696e-06, "loss": 0.8037, "step": 15190 }, { "epoch": 2.8609072087332956, "grad_norm": 15.38956069946289, "learning_rate": 8.556371165066818e-06, "loss": 0.7523, "step": 15200 }, { "epoch": 2.862789384528515, "grad_norm": 22.167905807495117, "learning_rate": 8.548842461885941e-06, "loss": 1.1415, "step": 15210 }, { "epoch": 2.8646715603237345, "grad_norm": 5.6959075927734375, "learning_rate": 8.541313758705064e-06, "loss": 0.7276, "step": 15220 }, { "epoch": 2.8665537361189535, "grad_norm": 15.122590065002441, "learning_rate": 8.533785055524186e-06, "loss": 0.7565, "step": 15230 }, { "epoch": 2.868435911914173, "grad_norm": 5.523094177246094, "learning_rate": 8.52625635234331e-06, "loss": 0.7688, "step": 15240 }, { "epoch": 2.870318087709392, "grad_norm": 13.523476600646973, "learning_rate": 8.518727649162433e-06, "loss": 0.8942, "step": 15250 }, { "epoch": 2.8722002635046113, "grad_norm": 13.976105690002441, "learning_rate": 8.511198945981556e-06, "loss": 0.7804, "step": 15260 }, { "epoch": 2.8740824392998308, "grad_norm": 23.4466552734375, "learning_rate": 8.503670242800677e-06, "loss": 0.7666, "step": 15270 }, { "epoch": 2.8759646150950497, "grad_norm": 20.092809677124023, "learning_rate": 8.496141539619802e-06, "loss": 0.9101, "step": 15280 }, { "epoch": 2.877846790890269, "grad_norm": 2.974151849746704, "learning_rate": 8.488612836438925e-06, "loss": 0.4844, "step": 15290 }, { "epoch": 2.879728966685488, "grad_norm": 24.60965919494629, "learning_rate": 8.481084133258048e-06, "loss": 0.6742, "step": 15300 }, { "epoch": 2.8816111424807076, "grad_norm": 6.7468485832214355, "learning_rate": 8.473555430077169e-06, "loss": 0.8118, "step": 15310 }, { "epoch": 2.883493318275927, "grad_norm": 16.004629135131836, "learning_rate": 8.466026726896292e-06, "loss": 0.8272, "step": 15320 }, { "epoch": 2.8853754940711465, "grad_norm": 9.687468528747559, "learning_rate": 8.458498023715416e-06, "loss": 0.6065, "step": 15330 }, { "epoch": 2.8872576698663655, "grad_norm": 18.13496971130371, "learning_rate": 8.45096932053454e-06, "loss": 0.7087, "step": 15340 }, { "epoch": 2.889139845661585, "grad_norm": 5.200069427490234, "learning_rate": 8.443440617353662e-06, "loss": 0.4101, "step": 15350 }, { "epoch": 2.891022021456804, "grad_norm": 18.469839096069336, "learning_rate": 8.435911914172784e-06, "loss": 0.4233, "step": 15360 }, { "epoch": 2.8929041972520233, "grad_norm": 27.56670379638672, "learning_rate": 8.428383210991906e-06, "loss": 0.9112, "step": 15370 }, { "epoch": 2.8947863730472427, "grad_norm": 18.819438934326172, "learning_rate": 8.420854507811031e-06, "loss": 0.92, "step": 15380 }, { "epoch": 2.8966685488424617, "grad_norm": 12.235003471374512, "learning_rate": 8.413325804630154e-06, "loss": 0.9199, "step": 15390 }, { "epoch": 2.898550724637681, "grad_norm": 12.527748107910156, "learning_rate": 8.405797101449275e-06, "loss": 0.6267, "step": 15400 }, { "epoch": 2.9004329004329006, "grad_norm": 20.517087936401367, "learning_rate": 8.398268398268398e-06, "loss": 0.9461, "step": 15410 }, { "epoch": 2.9023150762281196, "grad_norm": 7.900396347045898, "learning_rate": 8.390739695087523e-06, "loss": 0.5878, "step": 15420 }, { "epoch": 2.904197252023339, "grad_norm": 12.748804092407227, "learning_rate": 8.383210991906646e-06, "loss": 0.6466, "step": 15430 }, { "epoch": 2.9060794278185584, "grad_norm": 15.995560646057129, "learning_rate": 8.375682288725767e-06, "loss": 0.5473, "step": 15440 }, { "epoch": 2.9079616036137774, "grad_norm": 25.992582321166992, "learning_rate": 8.36815358554489e-06, "loss": 0.9877, "step": 15450 }, { "epoch": 2.909843779408997, "grad_norm": 3.78088641166687, "learning_rate": 8.360624882364013e-06, "loss": 0.4233, "step": 15460 }, { "epoch": 2.911725955204216, "grad_norm": 10.031113624572754, "learning_rate": 8.353096179183137e-06, "loss": 0.5838, "step": 15470 }, { "epoch": 2.9136081309994353, "grad_norm": 2.0009913444519043, "learning_rate": 8.34556747600226e-06, "loss": 0.975, "step": 15480 }, { "epoch": 2.9154903067946547, "grad_norm": 7.168464660644531, "learning_rate": 8.338038772821381e-06, "loss": 0.6154, "step": 15490 }, { "epoch": 2.917372482589874, "grad_norm": 18.54973793029785, "learning_rate": 8.330510069640504e-06, "loss": 0.974, "step": 15500 }, { "epoch": 2.919254658385093, "grad_norm": 1.8958022594451904, "learning_rate": 8.322981366459629e-06, "loss": 0.57, "step": 15510 }, { "epoch": 2.9211368341803126, "grad_norm": 10.597874641418457, "learning_rate": 8.315452663278752e-06, "loss": 0.6006, "step": 15520 }, { "epoch": 2.9230190099755315, "grad_norm": 15.698101043701172, "learning_rate": 8.307923960097873e-06, "loss": 0.5481, "step": 15530 }, { "epoch": 2.924901185770751, "grad_norm": 5.330072402954102, "learning_rate": 8.300395256916996e-06, "loss": 0.7784, "step": 15540 }, { "epoch": 2.9267833615659704, "grad_norm": 7.590254783630371, "learning_rate": 8.292866553736119e-06, "loss": 1.1277, "step": 15550 }, { "epoch": 2.9286655373611894, "grad_norm": 12.026838302612305, "learning_rate": 8.285337850555244e-06, "loss": 0.661, "step": 15560 }, { "epoch": 2.930547713156409, "grad_norm": 1.9510034322738647, "learning_rate": 8.277809147374365e-06, "loss": 0.7917, "step": 15570 }, { "epoch": 2.932429888951628, "grad_norm": 24.171648025512695, "learning_rate": 8.270280444193488e-06, "loss": 0.6675, "step": 15580 }, { "epoch": 2.9343120647468472, "grad_norm": 19.12851333618164, "learning_rate": 8.26275174101261e-06, "loss": 0.6863, "step": 15590 }, { "epoch": 2.9361942405420667, "grad_norm": 49.385536193847656, "learning_rate": 8.255223037831734e-06, "loss": 1.0035, "step": 15600 }, { "epoch": 2.938076416337286, "grad_norm": 13.706058502197266, "learning_rate": 8.247694334650858e-06, "loss": 0.9184, "step": 15610 }, { "epoch": 2.939958592132505, "grad_norm": 14.227673530578613, "learning_rate": 8.24016563146998e-06, "loss": 0.8337, "step": 15620 }, { "epoch": 2.9418407679277245, "grad_norm": 11.792285919189453, "learning_rate": 8.232636928289102e-06, "loss": 0.7102, "step": 15630 }, { "epoch": 2.9437229437229435, "grad_norm": 11.827733039855957, "learning_rate": 8.225108225108225e-06, "loss": 1.0902, "step": 15640 }, { "epoch": 2.945605119518163, "grad_norm": 22.433387756347656, "learning_rate": 8.21757952192735e-06, "loss": 0.6356, "step": 15650 }, { "epoch": 2.9474872953133824, "grad_norm": 2.886334180831909, "learning_rate": 8.210050818746471e-06, "loss": 0.6147, "step": 15660 }, { "epoch": 2.949369471108602, "grad_norm": 20.091821670532227, "learning_rate": 8.202522115565594e-06, "loss": 0.8424, "step": 15670 }, { "epoch": 2.951251646903821, "grad_norm": 11.964825630187988, "learning_rate": 8.194993412384717e-06, "loss": 0.8177, "step": 15680 }, { "epoch": 2.9531338226990402, "grad_norm": 24.54062271118164, "learning_rate": 8.18746470920384e-06, "loss": 1.1969, "step": 15690 }, { "epoch": 2.955015998494259, "grad_norm": 26.192880630493164, "learning_rate": 8.179936006022963e-06, "loss": 0.6056, "step": 15700 }, { "epoch": 2.9568981742894787, "grad_norm": 13.566141128540039, "learning_rate": 8.172407302842086e-06, "loss": 0.7419, "step": 15710 }, { "epoch": 2.958780350084698, "grad_norm": 37.28998565673828, "learning_rate": 8.164878599661209e-06, "loss": 0.5588, "step": 15720 }, { "epoch": 2.960662525879917, "grad_norm": 16.60234260559082, "learning_rate": 8.157349896480332e-06, "loss": 0.72, "step": 15730 }, { "epoch": 2.9625447016751365, "grad_norm": 24.26984977722168, "learning_rate": 8.149821193299455e-06, "loss": 0.7045, "step": 15740 }, { "epoch": 2.9644268774703555, "grad_norm": 6.713445663452148, "learning_rate": 8.142292490118577e-06, "loss": 0.5017, "step": 15750 }, { "epoch": 2.966309053265575, "grad_norm": 32.575897216796875, "learning_rate": 8.1347637869377e-06, "loss": 0.8543, "step": 15760 }, { "epoch": 2.9681912290607944, "grad_norm": 3.385469913482666, "learning_rate": 8.127235083756823e-06, "loss": 0.7149, "step": 15770 }, { "epoch": 2.970073404856014, "grad_norm": 29.050565719604492, "learning_rate": 8.119706380575946e-06, "loss": 1.0051, "step": 15780 }, { "epoch": 2.9719555806512328, "grad_norm": 2.618756055831909, "learning_rate": 8.112177677395069e-06, "loss": 0.6576, "step": 15790 }, { "epoch": 2.973837756446452, "grad_norm": 11.132329940795898, "learning_rate": 8.104648974214192e-06, "loss": 0.703, "step": 15800 }, { "epoch": 2.975719932241671, "grad_norm": 1.447586178779602, "learning_rate": 8.097120271033315e-06, "loss": 0.7784, "step": 15810 }, { "epoch": 2.9776021080368906, "grad_norm": 13.071996688842773, "learning_rate": 8.089591567852438e-06, "loss": 0.7363, "step": 15820 }, { "epoch": 2.97948428383211, "grad_norm": 8.223648071289062, "learning_rate": 8.08206286467156e-06, "loss": 0.7832, "step": 15830 }, { "epoch": 2.981366459627329, "grad_norm": 7.399938583374023, "learning_rate": 8.074534161490684e-06, "loss": 0.8243, "step": 15840 }, { "epoch": 2.9832486354225485, "grad_norm": 23.568254470825195, "learning_rate": 8.067005458309807e-06, "loss": 1.0995, "step": 15850 }, { "epoch": 2.9851308112177675, "grad_norm": 18.40585708618164, "learning_rate": 8.05947675512893e-06, "loss": 0.8484, "step": 15860 }, { "epoch": 2.987012987012987, "grad_norm": 7.398470401763916, "learning_rate": 8.051948051948052e-06, "loss": 0.7103, "step": 15870 }, { "epoch": 2.9888951628082063, "grad_norm": 41.91292190551758, "learning_rate": 8.044419348767175e-06, "loss": 0.376, "step": 15880 }, { "epoch": 2.9907773386034258, "grad_norm": 22.689844131469727, "learning_rate": 8.036890645586298e-06, "loss": 0.8109, "step": 15890 }, { "epoch": 2.9926595143986447, "grad_norm": 18.084108352661133, "learning_rate": 8.029361942405421e-06, "loss": 0.7224, "step": 15900 }, { "epoch": 2.994541690193864, "grad_norm": 20.86298179626465, "learning_rate": 8.021833239224544e-06, "loss": 0.7326, "step": 15910 }, { "epoch": 2.996423865989083, "grad_norm": 15.901738166809082, "learning_rate": 8.014304536043667e-06, "loss": 0.5458, "step": 15920 }, { "epoch": 2.9983060417843026, "grad_norm": 28.315481185913086, "learning_rate": 8.00677583286279e-06, "loss": 0.732, "step": 15930 }, { "epoch": 3.0, "eval_accuracy": 0.9138666666666667, "eval_loss": 0.3551448881626129, "eval_runtime": 124.8647, "eval_samples_per_second": 60.065, "eval_steps_per_second": 7.512, "step": 15939 }, { "epoch": 3.000188217579522, "grad_norm": 24.23802947998047, "learning_rate": 7.999247129681913e-06, "loss": 0.6504, "step": 15940 }, { "epoch": 3.002070393374741, "grad_norm": 23.428064346313477, "learning_rate": 7.991718426501036e-06, "loss": 0.8482, "step": 15950 }, { "epoch": 3.0039525691699605, "grad_norm": 17.107192993164062, "learning_rate": 7.984189723320159e-06, "loss": 0.8003, "step": 15960 }, { "epoch": 3.00583474496518, "grad_norm": 23.277639389038086, "learning_rate": 7.976661020139282e-06, "loss": 0.669, "step": 15970 }, { "epoch": 3.007716920760399, "grad_norm": 24.554271697998047, "learning_rate": 7.969132316958405e-06, "loss": 0.6349, "step": 15980 }, { "epoch": 3.0095990965556183, "grad_norm": 16.340686798095703, "learning_rate": 7.961603613777528e-06, "loss": 0.9792, "step": 15990 }, { "epoch": 3.0114812723508377, "grad_norm": 22.233644485473633, "learning_rate": 7.95407491059665e-06, "loss": 0.8113, "step": 16000 }, { "epoch": 3.0133634481460567, "grad_norm": 6.466579914093018, "learning_rate": 7.946546207415773e-06, "loss": 0.4841, "step": 16010 }, { "epoch": 3.015245623941276, "grad_norm": 7.007059574127197, "learning_rate": 7.939017504234896e-06, "loss": 0.9804, "step": 16020 }, { "epoch": 3.0171277997364956, "grad_norm": 9.863162994384766, "learning_rate": 7.93148880105402e-06, "loss": 0.5055, "step": 16030 }, { "epoch": 3.0190099755317146, "grad_norm": 27.062816619873047, "learning_rate": 7.923960097873142e-06, "loss": 0.6065, "step": 16040 }, { "epoch": 3.020892151326934, "grad_norm": 8.803095817565918, "learning_rate": 7.916431394692265e-06, "loss": 0.3608, "step": 16050 }, { "epoch": 3.022774327122153, "grad_norm": 2.467694044113159, "learning_rate": 7.908902691511388e-06, "loss": 0.7722, "step": 16060 }, { "epoch": 3.0246565029173724, "grad_norm": 8.088591575622559, "learning_rate": 7.901373988330511e-06, "loss": 0.5181, "step": 16070 }, { "epoch": 3.026538678712592, "grad_norm": 18.756155014038086, "learning_rate": 7.893845285149634e-06, "loss": 0.7348, "step": 16080 }, { "epoch": 3.028420854507811, "grad_norm": 16.53417205810547, "learning_rate": 7.886316581968757e-06, "loss": 0.4264, "step": 16090 }, { "epoch": 3.0303030303030303, "grad_norm": 6.082591533660889, "learning_rate": 7.87878787878788e-06, "loss": 0.6317, "step": 16100 }, { "epoch": 3.0321852060982497, "grad_norm": 29.989810943603516, "learning_rate": 7.871259175607003e-06, "loss": 0.4669, "step": 16110 }, { "epoch": 3.0340673818934687, "grad_norm": 9.16569995880127, "learning_rate": 7.863730472426126e-06, "loss": 0.6771, "step": 16120 }, { "epoch": 3.035949557688688, "grad_norm": 9.343409538269043, "learning_rate": 7.856201769245248e-06, "loss": 0.3368, "step": 16130 }, { "epoch": 3.0378317334839076, "grad_norm": 15.900422096252441, "learning_rate": 7.848673066064371e-06, "loss": 0.6708, "step": 16140 }, { "epoch": 3.0397139092791265, "grad_norm": 4.078092098236084, "learning_rate": 7.841144362883494e-06, "loss": 0.5507, "step": 16150 }, { "epoch": 3.041596085074346, "grad_norm": 1.3915058374404907, "learning_rate": 7.833615659702617e-06, "loss": 0.5978, "step": 16160 }, { "epoch": 3.0434782608695654, "grad_norm": 5.797540187835693, "learning_rate": 7.82608695652174e-06, "loss": 0.8542, "step": 16170 }, { "epoch": 3.0453604366647844, "grad_norm": 11.26925277709961, "learning_rate": 7.818558253340861e-06, "loss": 0.7604, "step": 16180 }, { "epoch": 3.047242612460004, "grad_norm": 11.07471752166748, "learning_rate": 7.811029550159986e-06, "loss": 0.8315, "step": 16190 }, { "epoch": 3.049124788255223, "grad_norm": 2.0462937355041504, "learning_rate": 7.803500846979109e-06, "loss": 0.7766, "step": 16200 }, { "epoch": 3.0510069640504422, "grad_norm": 14.226692199707031, "learning_rate": 7.795972143798232e-06, "loss": 0.9066, "step": 16210 }, { "epoch": 3.0528891398456617, "grad_norm": 19.502113342285156, "learning_rate": 7.788443440617355e-06, "loss": 0.8325, "step": 16220 }, { "epoch": 3.0547713156408807, "grad_norm": 9.413043022155762, "learning_rate": 7.780914737436478e-06, "loss": 0.5072, "step": 16230 }, { "epoch": 3.0566534914361, "grad_norm": 18.830753326416016, "learning_rate": 7.7733860342556e-06, "loss": 0.6074, "step": 16240 }, { "epoch": 3.0585356672313195, "grad_norm": 21.968721389770508, "learning_rate": 7.765857331074724e-06, "loss": 0.7097, "step": 16250 }, { "epoch": 3.0604178430265385, "grad_norm": 0.984065592288971, "learning_rate": 7.758328627893846e-06, "loss": 0.7093, "step": 16260 }, { "epoch": 3.062300018821758, "grad_norm": 1.9321556091308594, "learning_rate": 7.750799924712968e-06, "loss": 0.7335, "step": 16270 }, { "epoch": 3.0641821946169774, "grad_norm": 17.950807571411133, "learning_rate": 7.743271221532092e-06, "loss": 0.615, "step": 16280 }, { "epoch": 3.0660643704121964, "grad_norm": 10.872349739074707, "learning_rate": 7.735742518351215e-06, "loss": 0.7945, "step": 16290 }, { "epoch": 3.067946546207416, "grad_norm": 9.358575820922852, "learning_rate": 7.728213815170338e-06, "loss": 0.6757, "step": 16300 }, { "epoch": 3.0698287220026352, "grad_norm": 22.87818717956543, "learning_rate": 7.72068511198946e-06, "loss": 0.5002, "step": 16310 }, { "epoch": 3.0717108977978542, "grad_norm": 14.869016647338867, "learning_rate": 7.713156408808582e-06, "loss": 0.7588, "step": 16320 }, { "epoch": 3.0735930735930737, "grad_norm": 8.90829849243164, "learning_rate": 7.705627705627707e-06, "loss": 1.0797, "step": 16330 }, { "epoch": 3.075475249388293, "grad_norm": 27.344745635986328, "learning_rate": 7.69809900244683e-06, "loss": 0.5867, "step": 16340 }, { "epoch": 3.077357425183512, "grad_norm": 26.566848754882812, "learning_rate": 7.690570299265953e-06, "loss": 0.5187, "step": 16350 }, { "epoch": 3.0792396009787315, "grad_norm": 36.06024932861328, "learning_rate": 7.683041596085074e-06, "loss": 0.8335, "step": 16360 }, { "epoch": 3.0811217767739505, "grad_norm": 8.580551147460938, "learning_rate": 7.675512892904199e-06, "loss": 0.6761, "step": 16370 }, { "epoch": 3.08300395256917, "grad_norm": 36.79864501953125, "learning_rate": 7.667984189723321e-06, "loss": 0.8464, "step": 16380 }, { "epoch": 3.0848861283643894, "grad_norm": 75.77937316894531, "learning_rate": 7.660455486542444e-06, "loss": 0.7556, "step": 16390 }, { "epoch": 3.0867683041596083, "grad_norm": 0.26863107085227966, "learning_rate": 7.652926783361566e-06, "loss": 0.7862, "step": 16400 }, { "epoch": 3.0886504799548278, "grad_norm": 19.25002670288086, "learning_rate": 7.645398080180689e-06, "loss": 0.6988, "step": 16410 }, { "epoch": 3.090532655750047, "grad_norm": 5.37202262878418, "learning_rate": 7.637869376999813e-06, "loss": 0.7596, "step": 16420 }, { "epoch": 3.092414831545266, "grad_norm": 21.078012466430664, "learning_rate": 7.630340673818936e-06, "loss": 0.6614, "step": 16430 }, { "epoch": 3.0942970073404856, "grad_norm": 26.502809524536133, "learning_rate": 7.622811970638057e-06, "loss": 0.5694, "step": 16440 }, { "epoch": 3.096179183135705, "grad_norm": 3.8008010387420654, "learning_rate": 7.615283267457181e-06, "loss": 0.5919, "step": 16450 }, { "epoch": 3.098061358930924, "grad_norm": 2.185805320739746, "learning_rate": 7.607754564276304e-06, "loss": 1.0339, "step": 16460 }, { "epoch": 3.0999435347261435, "grad_norm": 1.0301812887191772, "learning_rate": 7.600225861095427e-06, "loss": 0.4373, "step": 16470 }, { "epoch": 3.1018257105213625, "grad_norm": 13.050724983215332, "learning_rate": 7.592697157914551e-06, "loss": 0.8136, "step": 16480 }, { "epoch": 3.103707886316582, "grad_norm": 17.977001190185547, "learning_rate": 7.585168454733673e-06, "loss": 0.5327, "step": 16490 }, { "epoch": 3.1055900621118013, "grad_norm": 14.840492248535156, "learning_rate": 7.577639751552796e-06, "loss": 0.5607, "step": 16500 }, { "epoch": 3.1074722379070203, "grad_norm": 9.116665840148926, "learning_rate": 7.570111048371919e-06, "loss": 0.7519, "step": 16510 }, { "epoch": 3.1093544137022397, "grad_norm": 1.0839731693267822, "learning_rate": 7.5625823451910415e-06, "loss": 0.726, "step": 16520 }, { "epoch": 3.111236589497459, "grad_norm": 11.97722339630127, "learning_rate": 7.555053642010164e-06, "loss": 0.681, "step": 16530 }, { "epoch": 3.113118765292678, "grad_norm": 28.19617462158203, "learning_rate": 7.547524938829287e-06, "loss": 0.5871, "step": 16540 }, { "epoch": 3.1150009410878976, "grad_norm": 31.05242347717285, "learning_rate": 7.53999623564841e-06, "loss": 0.5718, "step": 16550 }, { "epoch": 3.116883116883117, "grad_norm": 8.402994155883789, "learning_rate": 7.532467532467533e-06, "loss": 0.6193, "step": 16560 }, { "epoch": 3.118765292678336, "grad_norm": 10.946696281433105, "learning_rate": 7.524938829286656e-06, "loss": 0.5893, "step": 16570 }, { "epoch": 3.1206474684735555, "grad_norm": 29.695755004882812, "learning_rate": 7.517410126105778e-06, "loss": 0.8272, "step": 16580 }, { "epoch": 3.122529644268775, "grad_norm": 12.49916934967041, "learning_rate": 7.509881422924902e-06, "loss": 0.3927, "step": 16590 }, { "epoch": 3.124411820063994, "grad_norm": 22.52407455444336, "learning_rate": 7.502352719744025e-06, "loss": 0.8379, "step": 16600 }, { "epoch": 3.1262939958592133, "grad_norm": 20.72262191772461, "learning_rate": 7.494824016563148e-06, "loss": 0.8475, "step": 16610 }, { "epoch": 3.1281761716544327, "grad_norm": 18.68262481689453, "learning_rate": 7.48729531338227e-06, "loss": 0.5057, "step": 16620 }, { "epoch": 3.1300583474496517, "grad_norm": 21.154441833496094, "learning_rate": 7.479766610201394e-06, "loss": 0.6002, "step": 16630 }, { "epoch": 3.131940523244871, "grad_norm": 33.61641311645508, "learning_rate": 7.472237907020517e-06, "loss": 0.7546, "step": 16640 }, { "epoch": 3.13382269904009, "grad_norm": 54.20226287841797, "learning_rate": 7.4647092038396395e-06, "loss": 0.5867, "step": 16650 }, { "epoch": 3.1357048748353096, "grad_norm": 0.9356986284255981, "learning_rate": 7.457180500658762e-06, "loss": 0.6669, "step": 16660 }, { "epoch": 3.137587050630529, "grad_norm": 2.053644895553589, "learning_rate": 7.4496517974778845e-06, "loss": 0.6829, "step": 16670 }, { "epoch": 3.139469226425748, "grad_norm": 8.472742080688477, "learning_rate": 7.442123094297008e-06, "loss": 0.6525, "step": 16680 }, { "epoch": 3.1413514022209674, "grad_norm": 19.01861000061035, "learning_rate": 7.434594391116131e-06, "loss": 0.9197, "step": 16690 }, { "epoch": 3.143233578016187, "grad_norm": 100.38238525390625, "learning_rate": 7.427065687935254e-06, "loss": 0.6449, "step": 16700 }, { "epoch": 3.145115753811406, "grad_norm": 7.82566499710083, "learning_rate": 7.419536984754376e-06, "loss": 0.7607, "step": 16710 }, { "epoch": 3.1469979296066253, "grad_norm": 1.0181591510772705, "learning_rate": 7.412008281573499e-06, "loss": 0.7252, "step": 16720 }, { "epoch": 3.1488801054018447, "grad_norm": 3.0104870796203613, "learning_rate": 7.404479578392623e-06, "loss": 0.5994, "step": 16730 }, { "epoch": 3.1507622811970637, "grad_norm": 23.66948699951172, "learning_rate": 7.396950875211746e-06, "loss": 0.6255, "step": 16740 }, { "epoch": 3.152644456992283, "grad_norm": 10.450725555419922, "learning_rate": 7.389422172030868e-06, "loss": 0.8733, "step": 16750 }, { "epoch": 3.1545266327875026, "grad_norm": 17.46381378173828, "learning_rate": 7.381893468849991e-06, "loss": 0.572, "step": 16760 }, { "epoch": 3.1564088085827215, "grad_norm": 5.500278949737549, "learning_rate": 7.3743647656691146e-06, "loss": 0.8854, "step": 16770 }, { "epoch": 3.158290984377941, "grad_norm": 7.526846885681152, "learning_rate": 7.3668360624882375e-06, "loss": 0.9439, "step": 16780 }, { "epoch": 3.16017316017316, "grad_norm": 14.849970817565918, "learning_rate": 7.3593073593073596e-06, "loss": 0.9489, "step": 16790 }, { "epoch": 3.1620553359683794, "grad_norm": 23.20258140563965, "learning_rate": 7.3517786561264825e-06, "loss": 0.7435, "step": 16800 }, { "epoch": 3.163937511763599, "grad_norm": 11.882279396057129, "learning_rate": 7.344249952945605e-06, "loss": 0.69, "step": 16810 }, { "epoch": 3.165819687558818, "grad_norm": 17.517498016357422, "learning_rate": 7.336721249764729e-06, "loss": 0.5923, "step": 16820 }, { "epoch": 3.1677018633540373, "grad_norm": 11.301204681396484, "learning_rate": 7.329192546583852e-06, "loss": 0.447, "step": 16830 }, { "epoch": 3.1695840391492567, "grad_norm": 21.43952178955078, "learning_rate": 7.321663843402974e-06, "loss": 0.9853, "step": 16840 }, { "epoch": 3.1714662149444757, "grad_norm": 25.572917938232422, "learning_rate": 7.314135140222097e-06, "loss": 0.9242, "step": 16850 }, { "epoch": 3.173348390739695, "grad_norm": 1.663003921508789, "learning_rate": 7.30660643704122e-06, "loss": 0.4109, "step": 16860 }, { "epoch": 3.1752305665349145, "grad_norm": 32.387046813964844, "learning_rate": 7.299077733860344e-06, "loss": 0.9121, "step": 16870 }, { "epoch": 3.1771127423301335, "grad_norm": 7.099456787109375, "learning_rate": 7.291549030679466e-06, "loss": 0.5254, "step": 16880 }, { "epoch": 3.178994918125353, "grad_norm": 8.126106262207031, "learning_rate": 7.284020327498589e-06, "loss": 0.9133, "step": 16890 }, { "epoch": 3.1808770939205724, "grad_norm": 21.565263748168945, "learning_rate": 7.276491624317712e-06, "loss": 0.4913, "step": 16900 }, { "epoch": 3.1827592697157914, "grad_norm": 1.3021163940429688, "learning_rate": 7.2689629211368355e-06, "loss": 0.6226, "step": 16910 }, { "epoch": 3.184641445511011, "grad_norm": 28.11388397216797, "learning_rate": 7.2614342179559576e-06, "loss": 0.6418, "step": 16920 }, { "epoch": 3.18652362130623, "grad_norm": 21.68681526184082, "learning_rate": 7.2539055147750805e-06, "loss": 0.6138, "step": 16930 }, { "epoch": 3.1884057971014492, "grad_norm": 2.91939640045166, "learning_rate": 7.246376811594203e-06, "loss": 0.6506, "step": 16940 }, { "epoch": 3.1902879728966687, "grad_norm": 16.150680541992188, "learning_rate": 7.238848108413326e-06, "loss": 0.8231, "step": 16950 }, { "epoch": 3.1921701486918876, "grad_norm": 5.776913642883301, "learning_rate": 7.23131940523245e-06, "loss": 0.4947, "step": 16960 }, { "epoch": 3.194052324487107, "grad_norm": 1.355134129524231, "learning_rate": 7.223790702051572e-06, "loss": 0.4341, "step": 16970 }, { "epoch": 3.1959345002823265, "grad_norm": 8.270890235900879, "learning_rate": 7.216261998870695e-06, "loss": 0.6211, "step": 16980 }, { "epoch": 3.1978166760775455, "grad_norm": 9.533167839050293, "learning_rate": 7.208733295689818e-06, "loss": 0.6194, "step": 16990 }, { "epoch": 3.199698851872765, "grad_norm": 8.640917778015137, "learning_rate": 7.201204592508941e-06, "loss": 0.9036, "step": 17000 }, { "epoch": 3.2015810276679844, "grad_norm": 19.37293243408203, "learning_rate": 7.193675889328063e-06, "loss": 0.7672, "step": 17010 }, { "epoch": 3.2034632034632033, "grad_norm": 11.042953491210938, "learning_rate": 7.186147186147187e-06, "loss": 0.9198, "step": 17020 }, { "epoch": 3.2053453792584228, "grad_norm": 9.942525863647461, "learning_rate": 7.17861848296631e-06, "loss": 0.384, "step": 17030 }, { "epoch": 3.207227555053642, "grad_norm": 33.597999572753906, "learning_rate": 7.171089779785433e-06, "loss": 0.9105, "step": 17040 }, { "epoch": 3.209109730848861, "grad_norm": 14.744696617126465, "learning_rate": 7.163561076604555e-06, "loss": 0.574, "step": 17050 }, { "epoch": 3.2109919066440806, "grad_norm": 14.952895164489746, "learning_rate": 7.1560323734236784e-06, "loss": 0.662, "step": 17060 }, { "epoch": 3.2128740824393, "grad_norm": 7.853272914886475, "learning_rate": 7.148503670242801e-06, "loss": 0.9395, "step": 17070 }, { "epoch": 3.214756258234519, "grad_norm": 16.070457458496094, "learning_rate": 7.140974967061924e-06, "loss": 0.6052, "step": 17080 }, { "epoch": 3.2166384340297385, "grad_norm": 7.213963508605957, "learning_rate": 7.133446263881047e-06, "loss": 0.6311, "step": 17090 }, { "epoch": 3.2185206098249575, "grad_norm": 26.420625686645508, "learning_rate": 7.125917560700169e-06, "loss": 0.7319, "step": 17100 }, { "epoch": 3.220402785620177, "grad_norm": 12.303365707397461, "learning_rate": 7.118388857519293e-06, "loss": 0.8293, "step": 17110 }, { "epoch": 3.2222849614153963, "grad_norm": 19.405977249145508, "learning_rate": 7.110860154338416e-06, "loss": 0.6416, "step": 17120 }, { "epoch": 3.2241671372106153, "grad_norm": 20.179763793945312, "learning_rate": 7.103331451157539e-06, "loss": 0.635, "step": 17130 }, { "epoch": 3.2260493130058348, "grad_norm": 8.78466510772705, "learning_rate": 7.095802747976661e-06, "loss": 0.5423, "step": 17140 }, { "epoch": 3.227931488801054, "grad_norm": 39.822975158691406, "learning_rate": 7.088274044795784e-06, "loss": 0.6596, "step": 17150 }, { "epoch": 3.229813664596273, "grad_norm": 15.491219520568848, "learning_rate": 7.080745341614908e-06, "loss": 0.8531, "step": 17160 }, { "epoch": 3.2316958403914926, "grad_norm": 14.816864967346191, "learning_rate": 7.073216638434031e-06, "loss": 0.8523, "step": 17170 }, { "epoch": 3.233578016186712, "grad_norm": 2.6879067420959473, "learning_rate": 7.0656879352531535e-06, "loss": 0.4871, "step": 17180 }, { "epoch": 3.235460191981931, "grad_norm": 15.283522605895996, "learning_rate": 7.058159232072276e-06, "loss": 0.7526, "step": 17190 }, { "epoch": 3.2373423677771505, "grad_norm": 14.024410247802734, "learning_rate": 7.050630528891399e-06, "loss": 0.5458, "step": 17200 }, { "epoch": 3.2392245435723694, "grad_norm": 17.729820251464844, "learning_rate": 7.043101825710522e-06, "loss": 0.7159, "step": 17210 }, { "epoch": 3.241106719367589, "grad_norm": 11.570318222045898, "learning_rate": 7.035573122529645e-06, "loss": 0.8422, "step": 17220 }, { "epoch": 3.2429888951628083, "grad_norm": 29.138668060302734, "learning_rate": 7.028044419348767e-06, "loss": 0.7466, "step": 17230 }, { "epoch": 3.2448710709580273, "grad_norm": 27.035675048828125, "learning_rate": 7.02051571616789e-06, "loss": 0.5832, "step": 17240 }, { "epoch": 3.2467532467532467, "grad_norm": 12.801972389221191, "learning_rate": 7.012987012987014e-06, "loss": 0.7841, "step": 17250 }, { "epoch": 3.248635422548466, "grad_norm": 16.305395126342773, "learning_rate": 7.005458309806137e-06, "loss": 0.7556, "step": 17260 }, { "epoch": 3.250517598343685, "grad_norm": 23.742103576660156, "learning_rate": 6.997929606625259e-06, "loss": 0.6456, "step": 17270 }, { "epoch": 3.2523997741389046, "grad_norm": 14.787322044372559, "learning_rate": 6.990400903444382e-06, "loss": 0.7138, "step": 17280 }, { "epoch": 3.254281949934124, "grad_norm": 15.448890686035156, "learning_rate": 6.982872200263505e-06, "loss": 0.4583, "step": 17290 }, { "epoch": 3.256164125729343, "grad_norm": 17.958166122436523, "learning_rate": 6.9753434970826286e-06, "loss": 0.5352, "step": 17300 }, { "epoch": 3.2580463015245624, "grad_norm": 29.221757888793945, "learning_rate": 6.9678147939017515e-06, "loss": 0.7444, "step": 17310 }, { "epoch": 3.259928477319782, "grad_norm": 5.633476734161377, "learning_rate": 6.9602860907208736e-06, "loss": 0.4572, "step": 17320 }, { "epoch": 3.261810653115001, "grad_norm": 11.22506046295166, "learning_rate": 6.9527573875399965e-06, "loss": 0.5099, "step": 17330 }, { "epoch": 3.2636928289102203, "grad_norm": 10.039155960083008, "learning_rate": 6.94522868435912e-06, "loss": 0.5323, "step": 17340 }, { "epoch": 3.2655750047054397, "grad_norm": 9.84754467010498, "learning_rate": 6.937699981178243e-06, "loss": 0.7944, "step": 17350 }, { "epoch": 3.2674571805006587, "grad_norm": 34.082881927490234, "learning_rate": 6.930171277997365e-06, "loss": 0.7083, "step": 17360 }, { "epoch": 3.269339356295878, "grad_norm": 16.67072105407715, "learning_rate": 6.922642574816488e-06, "loss": 0.8544, "step": 17370 }, { "epoch": 3.271221532091097, "grad_norm": 15.253617286682129, "learning_rate": 6.915113871635611e-06, "loss": 0.6721, "step": 17380 }, { "epoch": 3.2731037078863165, "grad_norm": 12.421513557434082, "learning_rate": 6.907585168454735e-06, "loss": 0.55, "step": 17390 }, { "epoch": 3.274985883681536, "grad_norm": 7.298317909240723, "learning_rate": 6.900056465273857e-06, "loss": 0.6685, "step": 17400 }, { "epoch": 3.276868059476755, "grad_norm": 10.030412673950195, "learning_rate": 6.89252776209298e-06, "loss": 0.9101, "step": 17410 }, { "epoch": 3.2787502352719744, "grad_norm": 12.07267951965332, "learning_rate": 6.884999058912103e-06, "loss": 0.6002, "step": 17420 }, { "epoch": 3.280632411067194, "grad_norm": 9.45308780670166, "learning_rate": 6.8774703557312265e-06, "loss": 0.6294, "step": 17430 }, { "epoch": 3.282514586862413, "grad_norm": 3.554718255996704, "learning_rate": 6.8699416525503495e-06, "loss": 0.5614, "step": 17440 }, { "epoch": 3.2843967626576323, "grad_norm": 29.615537643432617, "learning_rate": 6.8624129493694715e-06, "loss": 0.6053, "step": 17450 }, { "epoch": 3.2862789384528517, "grad_norm": 15.930276870727539, "learning_rate": 6.8548842461885945e-06, "loss": 0.3862, "step": 17460 }, { "epoch": 3.2881611142480707, "grad_norm": 25.522640228271484, "learning_rate": 6.847355543007717e-06, "loss": 0.5614, "step": 17470 }, { "epoch": 3.29004329004329, "grad_norm": 36.311302185058594, "learning_rate": 6.839826839826841e-06, "loss": 0.674, "step": 17480 }, { "epoch": 3.291925465838509, "grad_norm": 19.43730354309082, "learning_rate": 6.832298136645963e-06, "loss": 0.5512, "step": 17490 }, { "epoch": 3.2938076416337285, "grad_norm": 5.826242446899414, "learning_rate": 6.824769433465086e-06, "loss": 0.9262, "step": 17500 }, { "epoch": 3.295689817428948, "grad_norm": 11.90711498260498, "learning_rate": 6.817240730284209e-06, "loss": 0.7307, "step": 17510 }, { "epoch": 3.2975719932241674, "grad_norm": 35.005157470703125, "learning_rate": 6.809712027103332e-06, "loss": 0.752, "step": 17520 }, { "epoch": 3.2994541690193864, "grad_norm": 12.222931861877441, "learning_rate": 6.802183323922454e-06, "loss": 0.6441, "step": 17530 }, { "epoch": 3.301336344814606, "grad_norm": 14.466747283935547, "learning_rate": 6.794654620741578e-06, "loss": 0.6763, "step": 17540 }, { "epoch": 3.303218520609825, "grad_norm": 17.527795791625977, "learning_rate": 6.787125917560701e-06, "loss": 0.4761, "step": 17550 }, { "epoch": 3.3051006964050442, "grad_norm": 11.280905723571777, "learning_rate": 6.779597214379824e-06, "loss": 0.939, "step": 17560 }, { "epoch": 3.3069828722002637, "grad_norm": 19.548994064331055, "learning_rate": 6.7720685111989474e-06, "loss": 0.747, "step": 17570 }, { "epoch": 3.3088650479954826, "grad_norm": 28.56268310546875, "learning_rate": 6.764539808018069e-06, "loss": 0.7412, "step": 17580 }, { "epoch": 3.310747223790702, "grad_norm": 23.072845458984375, "learning_rate": 6.7570111048371924e-06, "loss": 0.8036, "step": 17590 }, { "epoch": 3.3126293995859215, "grad_norm": 1.0671284198760986, "learning_rate": 6.749482401656315e-06, "loss": 0.5617, "step": 17600 }, { "epoch": 3.3145115753811405, "grad_norm": 17.233400344848633, "learning_rate": 6.741953698475438e-06, "loss": 0.6841, "step": 17610 }, { "epoch": 3.31639375117636, "grad_norm": 11.159910202026367, "learning_rate": 6.73442499529456e-06, "loss": 0.9642, "step": 17620 }, { "epoch": 3.3182759269715794, "grad_norm": 7.40802001953125, "learning_rate": 6.726896292113684e-06, "loss": 0.5016, "step": 17630 }, { "epoch": 3.3201581027667983, "grad_norm": 19.014680862426758, "learning_rate": 6.719367588932807e-06, "loss": 0.7707, "step": 17640 }, { "epoch": 3.322040278562018, "grad_norm": 13.653407096862793, "learning_rate": 6.71183888575193e-06, "loss": 0.3072, "step": 17650 }, { "epoch": 3.3239224543572368, "grad_norm": 2.027129650115967, "learning_rate": 6.704310182571052e-06, "loss": 0.8146, "step": 17660 }, { "epoch": 3.325804630152456, "grad_norm": 12.079022407531738, "learning_rate": 6.696781479390175e-06, "loss": 0.6395, "step": 17670 }, { "epoch": 3.3276868059476756, "grad_norm": 40.99082565307617, "learning_rate": 6.689252776209299e-06, "loss": 0.8666, "step": 17680 }, { "epoch": 3.3295689817428946, "grad_norm": 10.816984176635742, "learning_rate": 6.681724073028422e-06, "loss": 0.7107, "step": 17690 }, { "epoch": 3.331451157538114, "grad_norm": 14.31318473815918, "learning_rate": 6.6741953698475446e-06, "loss": 0.8113, "step": 17700 }, { "epoch": 3.3333333333333335, "grad_norm": 37.56098175048828, "learning_rate": 6.666666666666667e-06, "loss": 0.7322, "step": 17710 }, { "epoch": 3.3352155091285525, "grad_norm": 7.864965915679932, "learning_rate": 6.65913796348579e-06, "loss": 0.7357, "step": 17720 }, { "epoch": 3.337097684923772, "grad_norm": 20.20433807373047, "learning_rate": 6.651609260304913e-06, "loss": 0.8787, "step": 17730 }, { "epoch": 3.3389798607189913, "grad_norm": 15.400228500366211, "learning_rate": 6.644080557124036e-06, "loss": 0.8685, "step": 17740 }, { "epoch": 3.3408620365142103, "grad_norm": 1.1104925870895386, "learning_rate": 6.636551853943158e-06, "loss": 0.5544, "step": 17750 }, { "epoch": 3.3427442123094298, "grad_norm": 16.083879470825195, "learning_rate": 6.629023150762281e-06, "loss": 0.6363, "step": 17760 }, { "epoch": 3.3446263881046487, "grad_norm": 18.40869903564453, "learning_rate": 6.621494447581405e-06, "loss": 0.9739, "step": 17770 }, { "epoch": 3.346508563899868, "grad_norm": 4.225210189819336, "learning_rate": 6.613965744400528e-06, "loss": 0.8647, "step": 17780 }, { "epoch": 3.3483907396950876, "grad_norm": 3.2670443058013916, "learning_rate": 6.60643704121965e-06, "loss": 0.5671, "step": 17790 }, { "epoch": 3.350272915490307, "grad_norm": 23.90607261657715, "learning_rate": 6.598908338038773e-06, "loss": 0.7112, "step": 17800 }, { "epoch": 3.352155091285526, "grad_norm": 12.343167304992676, "learning_rate": 6.591379634857896e-06, "loss": 0.7628, "step": 17810 }, { "epoch": 3.3540372670807455, "grad_norm": 5.21704626083374, "learning_rate": 6.58385093167702e-06, "loss": 0.5627, "step": 17820 }, { "epoch": 3.3559194428759644, "grad_norm": 15.107742309570312, "learning_rate": 6.5763222284961426e-06, "loss": 0.6367, "step": 17830 }, { "epoch": 3.357801618671184, "grad_norm": 13.048922538757324, "learning_rate": 6.568793525315265e-06, "loss": 0.6175, "step": 17840 }, { "epoch": 3.3596837944664033, "grad_norm": 18.35613441467285, "learning_rate": 6.5612648221343875e-06, "loss": 0.629, "step": 17850 }, { "epoch": 3.3615659702616223, "grad_norm": 15.142684936523438, "learning_rate": 6.553736118953511e-06, "loss": 0.8152, "step": 17860 }, { "epoch": 3.3634481460568417, "grad_norm": 27.841224670410156, "learning_rate": 6.546207415772634e-06, "loss": 0.7843, "step": 17870 }, { "epoch": 3.365330321852061, "grad_norm": 7.560338020324707, "learning_rate": 6.538678712591756e-06, "loss": 0.5603, "step": 17880 }, { "epoch": 3.36721249764728, "grad_norm": 21.53891944885254, "learning_rate": 6.531150009410879e-06, "loss": 0.615, "step": 17890 }, { "epoch": 3.3690946734424996, "grad_norm": 11.022046089172363, "learning_rate": 6.523621306230002e-06, "loss": 0.5921, "step": 17900 }, { "epoch": 3.370976849237719, "grad_norm": 11.184412956237793, "learning_rate": 6.516092603049126e-06, "loss": 0.7016, "step": 17910 }, { "epoch": 3.372859025032938, "grad_norm": 19.147296905517578, "learning_rate": 6.508563899868249e-06, "loss": 0.87, "step": 17920 }, { "epoch": 3.3747412008281574, "grad_norm": 8.234039306640625, "learning_rate": 6.501035196687371e-06, "loss": 0.8574, "step": 17930 }, { "epoch": 3.3766233766233764, "grad_norm": 19.271259307861328, "learning_rate": 6.493506493506494e-06, "loss": 0.9815, "step": 17940 }, { "epoch": 3.378505552418596, "grad_norm": 9.072455406188965, "learning_rate": 6.485977790325617e-06, "loss": 0.5195, "step": 17950 }, { "epoch": 3.3803877282138153, "grad_norm": 9.557124137878418, "learning_rate": 6.4784490871447405e-06, "loss": 0.6915, "step": 17960 }, { "epoch": 3.3822699040090343, "grad_norm": 2.8738791942596436, "learning_rate": 6.470920383963863e-06, "loss": 0.396, "step": 17970 }, { "epoch": 3.3841520798042537, "grad_norm": 12.238557815551758, "learning_rate": 6.4633916807829855e-06, "loss": 0.7431, "step": 17980 }, { "epoch": 3.386034255599473, "grad_norm": 14.482816696166992, "learning_rate": 6.4558629776021084e-06, "loss": 0.6753, "step": 17990 }, { "epoch": 3.387916431394692, "grad_norm": 7.765755653381348, "learning_rate": 6.448334274421232e-06, "loss": 0.6066, "step": 18000 }, { "epoch": 3.3897986071899116, "grad_norm": 30.565616607666016, "learning_rate": 6.440805571240354e-06, "loss": 0.6136, "step": 18010 }, { "epoch": 3.391680782985131, "grad_norm": 13.92880916595459, "learning_rate": 6.433276868059477e-06, "loss": 0.6848, "step": 18020 }, { "epoch": 3.39356295878035, "grad_norm": 9.684536933898926, "learning_rate": 6.4257481648786e-06, "loss": 0.6168, "step": 18030 }, { "epoch": 3.3954451345755694, "grad_norm": 3.2142350673675537, "learning_rate": 6.418219461697723e-06, "loss": 0.6285, "step": 18040 }, { "epoch": 3.3973273103707884, "grad_norm": 6.0369977951049805, "learning_rate": 6.410690758516847e-06, "loss": 0.6715, "step": 18050 }, { "epoch": 3.399209486166008, "grad_norm": 16.24129295349121, "learning_rate": 6.403162055335969e-06, "loss": 0.8986, "step": 18060 }, { "epoch": 3.4010916619612273, "grad_norm": 26.860872268676758, "learning_rate": 6.395633352155092e-06, "loss": 0.6195, "step": 18070 }, { "epoch": 3.4029738377564467, "grad_norm": 11.599557876586914, "learning_rate": 6.388104648974215e-06, "loss": 0.6924, "step": 18080 }, { "epoch": 3.4048560135516657, "grad_norm": 21.338090896606445, "learning_rate": 6.380575945793338e-06, "loss": 0.6343, "step": 18090 }, { "epoch": 3.406738189346885, "grad_norm": 17.614233016967773, "learning_rate": 6.37304724261246e-06, "loss": 0.5183, "step": 18100 }, { "epoch": 3.408620365142104, "grad_norm": 58.38087844848633, "learning_rate": 6.3655185394315835e-06, "loss": 0.6562, "step": 18110 }, { "epoch": 3.4105025409373235, "grad_norm": 21.725601196289062, "learning_rate": 6.3579898362507064e-06, "loss": 0.5947, "step": 18120 }, { "epoch": 3.412384716732543, "grad_norm": 31.378341674804688, "learning_rate": 6.350461133069829e-06, "loss": 0.6798, "step": 18130 }, { "epoch": 3.414266892527762, "grad_norm": 0.62470942735672, "learning_rate": 6.342932429888951e-06, "loss": 0.669, "step": 18140 }, { "epoch": 3.4161490683229814, "grad_norm": 6.63415002822876, "learning_rate": 6.335403726708075e-06, "loss": 0.6178, "step": 18150 }, { "epoch": 3.418031244118201, "grad_norm": 5.154765605926514, "learning_rate": 6.327875023527198e-06, "loss": 0.666, "step": 18160 }, { "epoch": 3.41991341991342, "grad_norm": 9.039029121398926, "learning_rate": 6.320346320346321e-06, "loss": 0.4487, "step": 18170 }, { "epoch": 3.4217955957086392, "grad_norm": 4.979629039764404, "learning_rate": 6.312817617165444e-06, "loss": 0.5112, "step": 18180 }, { "epoch": 3.4236777715038587, "grad_norm": 14.24013614654541, "learning_rate": 6.305288913984566e-06, "loss": 0.4772, "step": 18190 }, { "epoch": 3.4255599472990776, "grad_norm": 20.53621482849121, "learning_rate": 6.29776021080369e-06, "loss": 0.4101, "step": 18200 }, { "epoch": 3.427442123094297, "grad_norm": 10.990429878234863, "learning_rate": 6.290231507622813e-06, "loss": 0.5397, "step": 18210 }, { "epoch": 3.429324298889516, "grad_norm": 11.857673645019531, "learning_rate": 6.282702804441936e-06, "loss": 0.6357, "step": 18220 }, { "epoch": 3.4312064746847355, "grad_norm": 6.039271354675293, "learning_rate": 6.275174101261058e-06, "loss": 0.5146, "step": 18230 }, { "epoch": 3.433088650479955, "grad_norm": 14.336277961730957, "learning_rate": 6.267645398080181e-06, "loss": 0.2947, "step": 18240 }, { "epoch": 3.434970826275174, "grad_norm": 24.33298110961914, "learning_rate": 6.260116694899304e-06, "loss": 0.9247, "step": 18250 }, { "epoch": 3.4368530020703933, "grad_norm": 12.793835639953613, "learning_rate": 6.252587991718427e-06, "loss": 0.4952, "step": 18260 }, { "epoch": 3.438735177865613, "grad_norm": 7.1921234130859375, "learning_rate": 6.245059288537549e-06, "loss": 0.7946, "step": 18270 }, { "epoch": 3.4406173536608318, "grad_norm": 21.25719451904297, "learning_rate": 6.237530585356672e-06, "loss": 0.73, "step": 18280 }, { "epoch": 3.442499529456051, "grad_norm": 12.582639694213867, "learning_rate": 6.230001882175796e-06, "loss": 0.6567, "step": 18290 }, { "epoch": 3.4443817052512706, "grad_norm": 2.455739736557007, "learning_rate": 6.222473178994919e-06, "loss": 0.5813, "step": 18300 }, { "epoch": 3.4462638810464896, "grad_norm": 15.017657279968262, "learning_rate": 6.214944475814042e-06, "loss": 0.7049, "step": 18310 }, { "epoch": 3.448146056841709, "grad_norm": 10.231067657470703, "learning_rate": 6.207415772633164e-06, "loss": 0.4853, "step": 18320 }, { "epoch": 3.4500282326369285, "grad_norm": 0.6758200526237488, "learning_rate": 6.199887069452287e-06, "loss": 0.6545, "step": 18330 }, { "epoch": 3.4519104084321475, "grad_norm": 2.3873395919799805, "learning_rate": 6.192358366271411e-06, "loss": 0.6442, "step": 18340 }, { "epoch": 3.453792584227367, "grad_norm": 8.824675559997559, "learning_rate": 6.184829663090534e-06, "loss": 0.6202, "step": 18350 }, { "epoch": 3.4556747600225863, "grad_norm": 1.2755610942840576, "learning_rate": 6.177300959909656e-06, "loss": 0.6583, "step": 18360 }, { "epoch": 3.4575569358178053, "grad_norm": 8.281763076782227, "learning_rate": 6.169772256728779e-06, "loss": 0.5532, "step": 18370 }, { "epoch": 3.4594391116130248, "grad_norm": 19.429561614990234, "learning_rate": 6.1622435535479015e-06, "loss": 0.6067, "step": 18380 }, { "epoch": 3.4613212874082437, "grad_norm": 5.232605457305908, "learning_rate": 6.154714850367025e-06, "loss": 0.702, "step": 18390 }, { "epoch": 3.463203463203463, "grad_norm": 38.510826110839844, "learning_rate": 6.147186147186147e-06, "loss": 0.6362, "step": 18400 }, { "epoch": 3.4650856389986826, "grad_norm": 5.245325088500977, "learning_rate": 6.13965744400527e-06, "loss": 0.7006, "step": 18410 }, { "epoch": 3.4669678147939016, "grad_norm": 42.82587814331055, "learning_rate": 6.132128740824393e-06, "loss": 0.9136, "step": 18420 }, { "epoch": 3.468849990589121, "grad_norm": 18.611064910888672, "learning_rate": 6.124600037643517e-06, "loss": 0.8132, "step": 18430 }, { "epoch": 3.4707321663843405, "grad_norm": 24.1761417388916, "learning_rate": 6.11707133446264e-06, "loss": 0.7517, "step": 18440 }, { "epoch": 3.4726143421795594, "grad_norm": 14.236517906188965, "learning_rate": 6.109542631281762e-06, "loss": 0.7883, "step": 18450 }, { "epoch": 3.474496517974779, "grad_norm": 23.19389533996582, "learning_rate": 6.102013928100885e-06, "loss": 0.6778, "step": 18460 }, { "epoch": 3.4763786937699983, "grad_norm": 17.408361434936523, "learning_rate": 6.094485224920008e-06, "loss": 0.425, "step": 18470 }, { "epoch": 3.4782608695652173, "grad_norm": 1.1383931636810303, "learning_rate": 6.086956521739132e-06, "loss": 0.6786, "step": 18480 }, { "epoch": 3.4801430453604367, "grad_norm": 18.74462890625, "learning_rate": 6.079427818558254e-06, "loss": 0.5072, "step": 18490 }, { "epoch": 3.4820252211556557, "grad_norm": 12.033364295959473, "learning_rate": 6.071899115377377e-06, "loss": 0.8424, "step": 18500 }, { "epoch": 3.483907396950875, "grad_norm": 13.617997169494629, "learning_rate": 6.0643704121964995e-06, "loss": 0.4691, "step": 18510 }, { "epoch": 3.4857895727460946, "grad_norm": 18.998056411743164, "learning_rate": 6.0568417090156224e-06, "loss": 0.9883, "step": 18520 }, { "epoch": 3.4876717485413136, "grad_norm": 13.122930526733398, "learning_rate": 6.049313005834746e-06, "loss": 0.7358, "step": 18530 }, { "epoch": 3.489553924336533, "grad_norm": 57.448787689208984, "learning_rate": 6.041784302653868e-06, "loss": 0.4635, "step": 18540 }, { "epoch": 3.4914361001317524, "grad_norm": 7.738725662231445, "learning_rate": 6.034255599472991e-06, "loss": 0.6152, "step": 18550 }, { "epoch": 3.4933182759269714, "grad_norm": 6.369852542877197, "learning_rate": 6.026726896292114e-06, "loss": 0.7051, "step": 18560 }, { "epoch": 3.495200451722191, "grad_norm": 9.485119819641113, "learning_rate": 6.019198193111238e-06, "loss": 0.5485, "step": 18570 }, { "epoch": 3.4970826275174103, "grad_norm": 14.34579849243164, "learning_rate": 6.01166948993036e-06, "loss": 0.5845, "step": 18580 }, { "epoch": 3.4989648033126293, "grad_norm": 8.484230995178223, "learning_rate": 6.004140786749483e-06, "loss": 0.5187, "step": 18590 }, { "epoch": 3.5008469791078487, "grad_norm": 9.235902786254883, "learning_rate": 5.996612083568606e-06, "loss": 0.7469, "step": 18600 }, { "epoch": 3.5027291549030677, "grad_norm": 19.91741371154785, "learning_rate": 5.989083380387729e-06, "loss": 0.5219, "step": 18610 }, { "epoch": 3.504611330698287, "grad_norm": 33.60355758666992, "learning_rate": 5.981554677206851e-06, "loss": 0.8795, "step": 18620 }, { "epoch": 3.5064935064935066, "grad_norm": 7.930181503295898, "learning_rate": 5.9740259740259746e-06, "loss": 0.7002, "step": 18630 }, { "epoch": 3.508375682288726, "grad_norm": 25.92783546447754, "learning_rate": 5.9664972708450975e-06, "loss": 0.6384, "step": 18640 }, { "epoch": 3.510257858083945, "grad_norm": 7.213856220245361, "learning_rate": 5.95896856766422e-06, "loss": 0.5609, "step": 18650 }, { "epoch": 3.5121400338791644, "grad_norm": 29.71574592590332, "learning_rate": 5.951439864483344e-06, "loss": 1.0729, "step": 18660 }, { "epoch": 3.5140222096743834, "grad_norm": 26.87563705444336, "learning_rate": 5.943911161302465e-06, "loss": 0.6124, "step": 18670 }, { "epoch": 3.515904385469603, "grad_norm": 5.8294782638549805, "learning_rate": 5.936382458121589e-06, "loss": 0.4782, "step": 18680 }, { "epoch": 3.5177865612648223, "grad_norm": 7.82859468460083, "learning_rate": 5.928853754940712e-06, "loss": 0.5349, "step": 18690 }, { "epoch": 3.5196687370600412, "grad_norm": 13.210909843444824, "learning_rate": 5.921325051759835e-06, "loss": 0.6603, "step": 18700 }, { "epoch": 3.5215509128552607, "grad_norm": 9.926183700561523, "learning_rate": 5.913796348578957e-06, "loss": 0.5646, "step": 18710 }, { "epoch": 3.52343308865048, "grad_norm": 12.920879364013672, "learning_rate": 5.906267645398081e-06, "loss": 0.6175, "step": 18720 }, { "epoch": 3.525315264445699, "grad_norm": 20.73908233642578, "learning_rate": 5.898738942217204e-06, "loss": 0.7321, "step": 18730 }, { "epoch": 3.5271974402409185, "grad_norm": 4.936865329742432, "learning_rate": 5.891210239036327e-06, "loss": 0.8867, "step": 18740 }, { "epoch": 3.529079616036138, "grad_norm": 16.362173080444336, "learning_rate": 5.883681535855449e-06, "loss": 0.6746, "step": 18750 }, { "epoch": 3.530961791831357, "grad_norm": 12.123333930969238, "learning_rate": 5.876152832674572e-06, "loss": 0.5928, "step": 18760 }, { "epoch": 3.5328439676265764, "grad_norm": 16.486318588256836, "learning_rate": 5.8686241294936955e-06, "loss": 0.6531, "step": 18770 }, { "epoch": 3.5347261434217954, "grad_norm": 5.473000526428223, "learning_rate": 5.861095426312818e-06, "loss": 0.7118, "step": 18780 }, { "epoch": 3.536608319217015, "grad_norm": 12.31059741973877, "learning_rate": 5.853566723131941e-06, "loss": 0.613, "step": 18790 }, { "epoch": 3.5384904950122342, "grad_norm": 10.703392028808594, "learning_rate": 5.846038019951063e-06, "loss": 0.6473, "step": 18800 }, { "epoch": 3.5403726708074537, "grad_norm": 53.403602600097656, "learning_rate": 5.838509316770186e-06, "loss": 0.6796, "step": 18810 }, { "epoch": 3.5422548466026726, "grad_norm": 10.487499237060547, "learning_rate": 5.83098061358931e-06, "loss": 0.5779, "step": 18820 }, { "epoch": 3.544137022397892, "grad_norm": 11.738143920898438, "learning_rate": 5.823451910408433e-06, "loss": 0.7215, "step": 18830 }, { "epoch": 3.546019198193111, "grad_norm": 13.654657363891602, "learning_rate": 5.815923207227555e-06, "loss": 0.6334, "step": 18840 }, { "epoch": 3.5479013739883305, "grad_norm": 14.808473587036133, "learning_rate": 5.808394504046678e-06, "loss": 0.7413, "step": 18850 }, { "epoch": 3.54978354978355, "grad_norm": 17.71009635925293, "learning_rate": 5.800865800865802e-06, "loss": 0.9974, "step": 18860 }, { "epoch": 3.551665725578769, "grad_norm": 15.377747535705566, "learning_rate": 5.793337097684925e-06, "loss": 0.5763, "step": 18870 }, { "epoch": 3.5535479013739883, "grad_norm": 12.32382869720459, "learning_rate": 5.785808394504047e-06, "loss": 0.4418, "step": 18880 }, { "epoch": 3.5554300771692073, "grad_norm": 12.920076370239258, "learning_rate": 5.77827969132317e-06, "loss": 0.6994, "step": 18890 }, { "epoch": 3.5573122529644268, "grad_norm": 15.59774112701416, "learning_rate": 5.770750988142293e-06, "loss": 0.5499, "step": 18900 }, { "epoch": 3.559194428759646, "grad_norm": 22.626419067382812, "learning_rate": 5.763222284961416e-06, "loss": 0.7776, "step": 18910 }, { "epoch": 3.5610766045548656, "grad_norm": 4.442861080169678, "learning_rate": 5.755693581780539e-06, "loss": 0.3559, "step": 18920 }, { "epoch": 3.5629587803500846, "grad_norm": 11.74802017211914, "learning_rate": 5.748164878599661e-06, "loss": 0.714, "step": 18930 }, { "epoch": 3.564840956145304, "grad_norm": 15.844473838806152, "learning_rate": 5.740636175418784e-06, "loss": 0.6158, "step": 18940 }, { "epoch": 3.566723131940523, "grad_norm": 32.16844940185547, "learning_rate": 5.733107472237908e-06, "loss": 0.8462, "step": 18950 }, { "epoch": 3.5686053077357425, "grad_norm": 28.286787033081055, "learning_rate": 5.725578769057031e-06, "loss": 0.6199, "step": 18960 }, { "epoch": 3.570487483530962, "grad_norm": 38.367637634277344, "learning_rate": 5.718050065876153e-06, "loss": 1.119, "step": 18970 }, { "epoch": 3.5723696593261813, "grad_norm": 8.818977355957031, "learning_rate": 5.710521362695276e-06, "loss": 0.6292, "step": 18980 }, { "epoch": 3.5742518351214003, "grad_norm": 54.24068832397461, "learning_rate": 5.702992659514399e-06, "loss": 0.5373, "step": 18990 }, { "epoch": 3.5761340109166198, "grad_norm": 17.338998794555664, "learning_rate": 5.695463956333523e-06, "loss": 0.6531, "step": 19000 }, { "epoch": 3.5780161867118387, "grad_norm": 9.126494407653809, "learning_rate": 5.687935253152645e-06, "loss": 0.8443, "step": 19010 }, { "epoch": 3.579898362507058, "grad_norm": 6.135142803192139, "learning_rate": 5.680406549971768e-06, "loss": 0.68, "step": 19020 }, { "epoch": 3.5817805383022776, "grad_norm": 39.82160568237305, "learning_rate": 5.672877846790891e-06, "loss": 0.4722, "step": 19030 }, { "epoch": 3.5836627140974966, "grad_norm": 1.3005073070526123, "learning_rate": 5.6653491436100135e-06, "loss": 0.5634, "step": 19040 }, { "epoch": 3.585544889892716, "grad_norm": 2.85640811920166, "learning_rate": 5.657820440429137e-06, "loss": 0.5805, "step": 19050 }, { "epoch": 3.587427065687935, "grad_norm": 29.665571212768555, "learning_rate": 5.650291737248259e-06, "loss": 0.9181, "step": 19060 }, { "epoch": 3.5893092414831544, "grad_norm": 33.12388229370117, "learning_rate": 5.642763034067382e-06, "loss": 0.5326, "step": 19070 }, { "epoch": 3.591191417278374, "grad_norm": 4.879305362701416, "learning_rate": 5.635234330886505e-06, "loss": 0.6239, "step": 19080 }, { "epoch": 3.5930735930735933, "grad_norm": 33.6097297668457, "learning_rate": 5.627705627705629e-06, "loss": 0.7447, "step": 19090 }, { "epoch": 3.5949557688688123, "grad_norm": 27.47477912902832, "learning_rate": 5.620176924524751e-06, "loss": 0.6465, "step": 19100 }, { "epoch": 3.5968379446640317, "grad_norm": 26.439849853515625, "learning_rate": 5.612648221343874e-06, "loss": 0.9106, "step": 19110 }, { "epoch": 3.5987201204592507, "grad_norm": 23.74281883239746, "learning_rate": 5.605119518162997e-06, "loss": 0.7119, "step": 19120 }, { "epoch": 3.60060229625447, "grad_norm": 27.2669677734375, "learning_rate": 5.59759081498212e-06, "loss": 0.6561, "step": 19130 }, { "epoch": 3.6024844720496896, "grad_norm": 10.283041000366211, "learning_rate": 5.590062111801242e-06, "loss": 0.5015, "step": 19140 }, { "epoch": 3.6043666478449086, "grad_norm": 5.571829795837402, "learning_rate": 5.582533408620366e-06, "loss": 0.3962, "step": 19150 }, { "epoch": 3.606248823640128, "grad_norm": 16.38119888305664, "learning_rate": 5.5750047054394886e-06, "loss": 0.9378, "step": 19160 }, { "epoch": 3.608130999435347, "grad_norm": 17.61245346069336, "learning_rate": 5.5674760022586115e-06, "loss": 0.7312, "step": 19170 }, { "epoch": 3.6100131752305664, "grad_norm": 117.15702056884766, "learning_rate": 5.559947299077734e-06, "loss": 0.4922, "step": 19180 }, { "epoch": 3.611895351025786, "grad_norm": 8.270508766174316, "learning_rate": 5.5524185958968565e-06, "loss": 0.8726, "step": 19190 }, { "epoch": 3.6137775268210053, "grad_norm": 39.04629135131836, "learning_rate": 5.54488989271598e-06, "loss": 0.7457, "step": 19200 }, { "epoch": 3.6156597026162243, "grad_norm": 43.17094802856445, "learning_rate": 5.537361189535103e-06, "loss": 0.7313, "step": 19210 }, { "epoch": 3.6175418784114437, "grad_norm": 22.584318161010742, "learning_rate": 5.529832486354226e-06, "loss": 0.4514, "step": 19220 }, { "epoch": 3.6194240542066627, "grad_norm": 25.183490753173828, "learning_rate": 5.522303783173348e-06, "loss": 0.5244, "step": 19230 }, { "epoch": 3.621306230001882, "grad_norm": 22.077592849731445, "learning_rate": 5.514775079992472e-06, "loss": 0.8154, "step": 19240 }, { "epoch": 3.6231884057971016, "grad_norm": 7.8474016189575195, "learning_rate": 5.507246376811595e-06, "loss": 0.7153, "step": 19250 }, { "epoch": 3.625070581592321, "grad_norm": 0.6397775411605835, "learning_rate": 5.499717673630718e-06, "loss": 0.7141, "step": 19260 }, { "epoch": 3.62695275738754, "grad_norm": 15.466002464294434, "learning_rate": 5.492188970449841e-06, "loss": 0.7019, "step": 19270 }, { "epoch": 3.6288349331827594, "grad_norm": 30.216781616210938, "learning_rate": 5.484660267268963e-06, "loss": 0.8668, "step": 19280 }, { "epoch": 3.6307171089779784, "grad_norm": 11.025100708007812, "learning_rate": 5.4771315640880865e-06, "loss": 0.6875, "step": 19290 }, { "epoch": 3.632599284773198, "grad_norm": 17.533348083496094, "learning_rate": 5.4696028609072095e-06, "loss": 0.4557, "step": 19300 }, { "epoch": 3.6344814605684173, "grad_norm": 29.006498336791992, "learning_rate": 5.462074157726332e-06, "loss": 0.7939, "step": 19310 }, { "epoch": 3.6363636363636362, "grad_norm": 18.006269454956055, "learning_rate": 5.4545454545454545e-06, "loss": 0.5706, "step": 19320 }, { "epoch": 3.6382458121588557, "grad_norm": 11.842194557189941, "learning_rate": 5.447016751364577e-06, "loss": 0.7348, "step": 19330 }, { "epoch": 3.6401279879540747, "grad_norm": 5.13842248916626, "learning_rate": 5.439488048183701e-06, "loss": 0.5898, "step": 19340 }, { "epoch": 3.642010163749294, "grad_norm": 2.101395845413208, "learning_rate": 5.431959345002824e-06, "loss": 0.6409, "step": 19350 }, { "epoch": 3.6438923395445135, "grad_norm": 8.308853149414062, "learning_rate": 5.424430641821946e-06, "loss": 0.8893, "step": 19360 }, { "epoch": 3.645774515339733, "grad_norm": 6.142594814300537, "learning_rate": 5.416901938641069e-06, "loss": 0.9554, "step": 19370 }, { "epoch": 3.647656691134952, "grad_norm": 4.47888708114624, "learning_rate": 5.409373235460193e-06, "loss": 0.8409, "step": 19380 }, { "epoch": 3.6495388669301714, "grad_norm": 9.741399765014648, "learning_rate": 5.401844532279316e-06, "loss": 0.66, "step": 19390 }, { "epoch": 3.6514210427253904, "grad_norm": 2.779855966567993, "learning_rate": 5.394315829098439e-06, "loss": 0.7436, "step": 19400 }, { "epoch": 3.65330321852061, "grad_norm": 19.320926666259766, "learning_rate": 5.386787125917561e-06, "loss": 0.6752, "step": 19410 }, { "epoch": 3.6551853943158292, "grad_norm": 27.487380981445312, "learning_rate": 5.379258422736684e-06, "loss": 0.7517, "step": 19420 }, { "epoch": 3.657067570111048, "grad_norm": 24.027660369873047, "learning_rate": 5.3717297195558074e-06, "loss": 0.6483, "step": 19430 }, { "epoch": 3.6589497459062676, "grad_norm": 9.766282081604004, "learning_rate": 5.36420101637493e-06, "loss": 0.5122, "step": 19440 }, { "epoch": 3.660831921701487, "grad_norm": 20.8446102142334, "learning_rate": 5.3566723131940524e-06, "loss": 0.7974, "step": 19450 }, { "epoch": 3.662714097496706, "grad_norm": 15.975231170654297, "learning_rate": 5.349143610013175e-06, "loss": 0.6866, "step": 19460 }, { "epoch": 3.6645962732919255, "grad_norm": 30.064542770385742, "learning_rate": 5.341614906832298e-06, "loss": 0.6704, "step": 19470 }, { "epoch": 3.666478449087145, "grad_norm": 16.590923309326172, "learning_rate": 5.334086203651422e-06, "loss": 0.7576, "step": 19480 }, { "epoch": 3.668360624882364, "grad_norm": 11.002893447875977, "learning_rate": 5.326557500470544e-06, "loss": 0.5608, "step": 19490 }, { "epoch": 3.6702428006775834, "grad_norm": 10.331709861755371, "learning_rate": 5.319028797289667e-06, "loss": 0.8275, "step": 19500 }, { "epoch": 3.6721249764728023, "grad_norm": 17.15074348449707, "learning_rate": 5.31150009410879e-06, "loss": 0.7005, "step": 19510 }, { "epoch": 3.6740071522680218, "grad_norm": 11.861909866333008, "learning_rate": 5.303971390927914e-06, "loss": 0.4037, "step": 19520 }, { "epoch": 3.675889328063241, "grad_norm": 17.04161262512207, "learning_rate": 5.296442687747037e-06, "loss": 0.5099, "step": 19530 }, { "epoch": 3.6777715038584606, "grad_norm": 20.979482650756836, "learning_rate": 5.288913984566159e-06, "loss": 0.5826, "step": 19540 }, { "epoch": 3.6796536796536796, "grad_norm": 18.8994140625, "learning_rate": 5.281385281385282e-06, "loss": 0.774, "step": 19550 }, { "epoch": 3.681535855448899, "grad_norm": 4.718742370605469, "learning_rate": 5.2738565782044046e-06, "loss": 0.7723, "step": 19560 }, { "epoch": 3.683418031244118, "grad_norm": 18.240686416625977, "learning_rate": 5.266327875023528e-06, "loss": 0.6407, "step": 19570 }, { "epoch": 3.6853002070393375, "grad_norm": 6.685810565948486, "learning_rate": 5.25879917184265e-06, "loss": 0.7073, "step": 19580 }, { "epoch": 3.687182382834557, "grad_norm": 15.091521263122559, "learning_rate": 5.251270468661773e-06, "loss": 0.6525, "step": 19590 }, { "epoch": 3.689064558629776, "grad_norm": 26.886425018310547, "learning_rate": 5.243741765480896e-06, "loss": 0.847, "step": 19600 }, { "epoch": 3.6909467344249953, "grad_norm": 11.26270866394043, "learning_rate": 5.236213062300019e-06, "loss": 0.5092, "step": 19610 }, { "epoch": 3.6928289102202143, "grad_norm": 17.176101684570312, "learning_rate": 5.228684359119141e-06, "loss": 0.6328, "step": 19620 }, { "epoch": 3.6947110860154337, "grad_norm": 15.91916275024414, "learning_rate": 5.221155655938265e-06, "loss": 0.6865, "step": 19630 }, { "epoch": 3.696593261810653, "grad_norm": 10.485185623168945, "learning_rate": 5.213626952757388e-06, "loss": 0.5489, "step": 19640 }, { "epoch": 3.6984754376058726, "grad_norm": 14.304201126098633, "learning_rate": 5.206098249576511e-06, "loss": 0.6536, "step": 19650 }, { "epoch": 3.7003576134010916, "grad_norm": 10.666767120361328, "learning_rate": 5.198569546395635e-06, "loss": 0.5502, "step": 19660 }, { "epoch": 3.702239789196311, "grad_norm": 13.534510612487793, "learning_rate": 5.191040843214757e-06, "loss": 0.5719, "step": 19670 }, { "epoch": 3.70412196499153, "grad_norm": 16.280838012695312, "learning_rate": 5.18351214003388e-06, "loss": 0.8457, "step": 19680 }, { "epoch": 3.7060041407867494, "grad_norm": 21.206565856933594, "learning_rate": 5.1759834368530025e-06, "loss": 0.4464, "step": 19690 }, { "epoch": 3.707886316581969, "grad_norm": 2.042757987976074, "learning_rate": 5.1684547336721255e-06, "loss": 0.4024, "step": 19700 }, { "epoch": 3.709768492377188, "grad_norm": 13.518824577331543, "learning_rate": 5.1609260304912475e-06, "loss": 0.723, "step": 19710 }, { "epoch": 3.7116506681724073, "grad_norm": 23.17169761657715, "learning_rate": 5.153397327310371e-06, "loss": 0.4086, "step": 19720 }, { "epoch": 3.7135328439676267, "grad_norm": 10.826898574829102, "learning_rate": 5.145868624129494e-06, "loss": 0.4519, "step": 19730 }, { "epoch": 3.7154150197628457, "grad_norm": 10.846261024475098, "learning_rate": 5.138339920948617e-06, "loss": 0.6457, "step": 19740 }, { "epoch": 3.717297195558065, "grad_norm": 6.996768474578857, "learning_rate": 5.130811217767739e-06, "loss": 0.5838, "step": 19750 }, { "epoch": 3.7191793713532846, "grad_norm": 2.6749439239501953, "learning_rate": 5.123282514586862e-06, "loss": 0.5056, "step": 19760 }, { "epoch": 3.7210615471485036, "grad_norm": 2.4390110969543457, "learning_rate": 5.115753811405986e-06, "loss": 0.6283, "step": 19770 }, { "epoch": 3.722943722943723, "grad_norm": 6.546153545379639, "learning_rate": 5.108225108225109e-06, "loss": 0.8119, "step": 19780 }, { "epoch": 3.724825898738942, "grad_norm": 22.23106575012207, "learning_rate": 5.100696405044232e-06, "loss": 0.7151, "step": 19790 }, { "epoch": 3.7267080745341614, "grad_norm": 15.080765724182129, "learning_rate": 5.093167701863354e-06, "loss": 0.629, "step": 19800 }, { "epoch": 3.728590250329381, "grad_norm": 13.616122245788574, "learning_rate": 5.085638998682478e-06, "loss": 0.6677, "step": 19810 }, { "epoch": 3.7304724261246003, "grad_norm": 3.014514684677124, "learning_rate": 5.0781102955016005e-06, "loss": 0.5376, "step": 19820 }, { "epoch": 3.7323546019198193, "grad_norm": 5.815634250640869, "learning_rate": 5.0705815923207234e-06, "loss": 0.6015, "step": 19830 }, { "epoch": 3.7342367777150387, "grad_norm": 9.241657257080078, "learning_rate": 5.0630528891398455e-06, "loss": 0.8149, "step": 19840 }, { "epoch": 3.7361189535102577, "grad_norm": 24.852537155151367, "learning_rate": 5.0555241859589684e-06, "loss": 0.4347, "step": 19850 }, { "epoch": 3.738001129305477, "grad_norm": 7.402453422546387, "learning_rate": 5.047995482778092e-06, "loss": 0.3041, "step": 19860 }, { "epoch": 3.7398833051006966, "grad_norm": 14.335990905761719, "learning_rate": 5.040466779597215e-06, "loss": 0.821, "step": 19870 }, { "epoch": 3.7417654808959155, "grad_norm": 16.409059524536133, "learning_rate": 5.032938076416338e-06, "loss": 0.9245, "step": 19880 }, { "epoch": 3.743647656691135, "grad_norm": 13.059428215026855, "learning_rate": 5.02540937323546e-06, "loss": 0.5391, "step": 19890 }, { "epoch": 3.745529832486354, "grad_norm": 14.962140083312988, "learning_rate": 5.017880670054583e-06, "loss": 0.4168, "step": 19900 }, { "epoch": 3.7474120082815734, "grad_norm": 8.028316497802734, "learning_rate": 5.010351966873707e-06, "loss": 0.5744, "step": 19910 }, { "epoch": 3.749294184076793, "grad_norm": 16.884626388549805, "learning_rate": 5.00282326369283e-06, "loss": 0.5241, "step": 19920 }, { "epoch": 3.7511763598720123, "grad_norm": 17.39742660522461, "learning_rate": 4.995294560511953e-06, "loss": 0.542, "step": 19930 }, { "epoch": 3.7530585356672312, "grad_norm": 13.098750114440918, "learning_rate": 4.987765857331075e-06, "loss": 0.7032, "step": 19940 }, { "epoch": 3.7549407114624507, "grad_norm": 10.768354415893555, "learning_rate": 4.9802371541501985e-06, "loss": 0.7465, "step": 19950 }, { "epoch": 3.7568228872576697, "grad_norm": 0.5966710448265076, "learning_rate": 4.9727084509693206e-06, "loss": 0.4171, "step": 19960 }, { "epoch": 3.758705063052889, "grad_norm": 3.4089717864990234, "learning_rate": 4.9651797477884435e-06, "loss": 0.538, "step": 19970 }, { "epoch": 3.7605872388481085, "grad_norm": 35.42552185058594, "learning_rate": 4.957651044607566e-06, "loss": 0.6669, "step": 19980 }, { "epoch": 3.762469414643328, "grad_norm": 24.851398468017578, "learning_rate": 4.950122341426689e-06, "loss": 0.4661, "step": 19990 }, { "epoch": 3.764351590438547, "grad_norm": 0.36463797092437744, "learning_rate": 4.942593638245812e-06, "loss": 0.4581, "step": 20000 }, { "epoch": 3.7662337662337664, "grad_norm": 3.793344736099243, "learning_rate": 4.935064935064935e-06, "loss": 0.4361, "step": 20010 }, { "epoch": 3.7681159420289854, "grad_norm": 12.976563453674316, "learning_rate": 4.927536231884059e-06, "loss": 0.6536, "step": 20020 }, { "epoch": 3.769998117824205, "grad_norm": 33.84760284423828, "learning_rate": 4.920007528703181e-06, "loss": 0.6451, "step": 20030 }, { "epoch": 3.7718802936194242, "grad_norm": 9.5062255859375, "learning_rate": 4.912478825522305e-06, "loss": 0.6208, "step": 20040 }, { "epoch": 3.773762469414643, "grad_norm": 16.10189437866211, "learning_rate": 4.904950122341427e-06, "loss": 0.6699, "step": 20050 }, { "epoch": 3.7756446452098626, "grad_norm": 29.097278594970703, "learning_rate": 4.89742141916055e-06, "loss": 0.7841, "step": 20060 }, { "epoch": 3.7775268210050816, "grad_norm": 12.631217002868652, "learning_rate": 4.889892715979673e-06, "loss": 0.5276, "step": 20070 }, { "epoch": 3.779408996800301, "grad_norm": 10.902958869934082, "learning_rate": 4.882364012798796e-06, "loss": 0.7378, "step": 20080 }, { "epoch": 3.7812911725955205, "grad_norm": 1.129839301109314, "learning_rate": 4.8748353096179186e-06, "loss": 0.5461, "step": 20090 }, { "epoch": 3.78317334839074, "grad_norm": 8.831612586975098, "learning_rate": 4.8673066064370415e-06, "loss": 0.5889, "step": 20100 }, { "epoch": 3.785055524185959, "grad_norm": 42.60455322265625, "learning_rate": 4.859777903256164e-06, "loss": 0.4435, "step": 20110 }, { "epoch": 3.7869376999811784, "grad_norm": 24.456605911254883, "learning_rate": 4.852249200075287e-06, "loss": 0.7794, "step": 20120 }, { "epoch": 3.7888198757763973, "grad_norm": 3.61232328414917, "learning_rate": 4.84472049689441e-06, "loss": 0.7502, "step": 20130 }, { "epoch": 3.7907020515716168, "grad_norm": 16.118135452270508, "learning_rate": 4.837191793713533e-06, "loss": 0.5583, "step": 20140 }, { "epoch": 3.792584227366836, "grad_norm": 25.525503158569336, "learning_rate": 4.829663090532656e-06, "loss": 0.8087, "step": 20150 }, { "epoch": 3.794466403162055, "grad_norm": 22.253944396972656, "learning_rate": 4.822134387351779e-06, "loss": 0.3545, "step": 20160 }, { "epoch": 3.7963485789572746, "grad_norm": 21.64002799987793, "learning_rate": 4.814605684170902e-06, "loss": 0.7379, "step": 20170 }, { "epoch": 3.7982307547524936, "grad_norm": 19.44178581237793, "learning_rate": 4.807076980990025e-06, "loss": 0.4395, "step": 20180 }, { "epoch": 3.800112930547713, "grad_norm": 13.115379333496094, "learning_rate": 4.799548277809148e-06, "loss": 0.6557, "step": 20190 }, { "epoch": 3.8019951063429325, "grad_norm": 4.139683246612549, "learning_rate": 4.792019574628271e-06, "loss": 0.5407, "step": 20200 }, { "epoch": 3.803877282138152, "grad_norm": 0.6734669208526611, "learning_rate": 4.784490871447394e-06, "loss": 0.4562, "step": 20210 }, { "epoch": 3.805759457933371, "grad_norm": 18.29108238220215, "learning_rate": 4.7769621682665165e-06, "loss": 0.6391, "step": 20220 }, { "epoch": 3.8076416337285903, "grad_norm": 20.505464553833008, "learning_rate": 4.7694334650856395e-06, "loss": 0.4091, "step": 20230 }, { "epoch": 3.8095238095238093, "grad_norm": 15.422745704650879, "learning_rate": 4.761904761904762e-06, "loss": 0.9266, "step": 20240 }, { "epoch": 3.8114059853190287, "grad_norm": 21.413530349731445, "learning_rate": 4.754376058723885e-06, "loss": 0.9884, "step": 20250 }, { "epoch": 3.813288161114248, "grad_norm": 6.468101501464844, "learning_rate": 4.746847355543008e-06, "loss": 0.8135, "step": 20260 }, { "epoch": 3.8151703369094676, "grad_norm": 9.80642032623291, "learning_rate": 4.739318652362131e-06, "loss": 0.3934, "step": 20270 }, { "epoch": 3.8170525127046866, "grad_norm": 8.44393253326416, "learning_rate": 4.731789949181254e-06, "loss": 0.643, "step": 20280 }, { "epoch": 3.818934688499906, "grad_norm": 4.562558174133301, "learning_rate": 4.724261246000377e-06, "loss": 0.6693, "step": 20290 }, { "epoch": 3.820816864295125, "grad_norm": 5.316037654876709, "learning_rate": 4.7167325428195e-06, "loss": 0.6138, "step": 20300 }, { "epoch": 3.8226990400903444, "grad_norm": 10.8194580078125, "learning_rate": 4.709203839638623e-06, "loss": 0.3942, "step": 20310 }, { "epoch": 3.824581215885564, "grad_norm": 22.41046142578125, "learning_rate": 4.701675136457746e-06, "loss": 0.299, "step": 20320 }, { "epoch": 3.826463391680783, "grad_norm": 6.464774131774902, "learning_rate": 4.694146433276869e-06, "loss": 0.4668, "step": 20330 }, { "epoch": 3.8283455674760023, "grad_norm": 0.8023673892021179, "learning_rate": 4.686617730095992e-06, "loss": 0.8259, "step": 20340 }, { "epoch": 3.8302277432712213, "grad_norm": 0.310139536857605, "learning_rate": 4.679089026915114e-06, "loss": 0.6507, "step": 20350 }, { "epoch": 3.8321099190664407, "grad_norm": 21.757549285888672, "learning_rate": 4.6715603237342374e-06, "loss": 0.6305, "step": 20360 }, { "epoch": 3.83399209486166, "grad_norm": 23.21905517578125, "learning_rate": 4.66403162055336e-06, "loss": 0.7793, "step": 20370 }, { "epoch": 3.8358742706568796, "grad_norm": 16.708805084228516, "learning_rate": 4.656502917372483e-06, "loss": 0.7145, "step": 20380 }, { "epoch": 3.8377564464520986, "grad_norm": 2.589465856552124, "learning_rate": 4.648974214191606e-06, "loss": 0.5476, "step": 20390 }, { "epoch": 3.839638622247318, "grad_norm": 21.747697830200195, "learning_rate": 4.641445511010729e-06, "loss": 0.5945, "step": 20400 }, { "epoch": 3.841520798042537, "grad_norm": 10.856985092163086, "learning_rate": 4.633916807829852e-06, "loss": 0.524, "step": 20410 }, { "epoch": 3.8434029738377564, "grad_norm": 19.395648956298828, "learning_rate": 4.626388104648974e-06, "loss": 0.743, "step": 20420 }, { "epoch": 3.845285149632976, "grad_norm": 26.709550857543945, "learning_rate": 4.618859401468098e-06, "loss": 0.4844, "step": 20430 }, { "epoch": 3.847167325428195, "grad_norm": 19.377134323120117, "learning_rate": 4.61133069828722e-06, "loss": 0.3497, "step": 20440 }, { "epoch": 3.8490495012234143, "grad_norm": 14.577184677124023, "learning_rate": 4.603801995106344e-06, "loss": 0.7667, "step": 20450 }, { "epoch": 3.8509316770186337, "grad_norm": 11.72651481628418, "learning_rate": 4.596273291925466e-06, "loss": 0.7216, "step": 20460 }, { "epoch": 3.8528138528138527, "grad_norm": 9.256969451904297, "learning_rate": 4.5887445887445896e-06, "loss": 0.6854, "step": 20470 }, { "epoch": 3.854696028609072, "grad_norm": 13.852824211120605, "learning_rate": 4.581215885563712e-06, "loss": 0.6446, "step": 20480 }, { "epoch": 3.8565782044042916, "grad_norm": 13.739072799682617, "learning_rate": 4.5736871823828346e-06, "loss": 0.7871, "step": 20490 }, { "epoch": 3.8584603801995105, "grad_norm": 22.21795654296875, "learning_rate": 4.566158479201958e-06, "loss": 0.5034, "step": 20500 }, { "epoch": 3.86034255599473, "grad_norm": 18.042375564575195, "learning_rate": 4.55862977602108e-06, "loss": 0.5784, "step": 20510 }, { "epoch": 3.862224731789949, "grad_norm": 16.496652603149414, "learning_rate": 4.551101072840204e-06, "loss": 0.7036, "step": 20520 }, { "epoch": 3.8641069075851684, "grad_norm": 23.149200439453125, "learning_rate": 4.543572369659326e-06, "loss": 0.7427, "step": 20530 }, { "epoch": 3.865989083380388, "grad_norm": 25.870445251464844, "learning_rate": 4.53604366647845e-06, "loss": 0.6495, "step": 20540 }, { "epoch": 3.8678712591756073, "grad_norm": 5.6248674392700195, "learning_rate": 4.528514963297572e-06, "loss": 0.6211, "step": 20550 }, { "epoch": 3.8697534349708262, "grad_norm": 3.3882083892822266, "learning_rate": 4.520986260116695e-06, "loss": 0.9215, "step": 20560 }, { "epoch": 3.8716356107660457, "grad_norm": 2.2890217304229736, "learning_rate": 4.513457556935818e-06, "loss": 0.6994, "step": 20570 }, { "epoch": 3.8735177865612647, "grad_norm": 1.9337209463119507, "learning_rate": 4.505928853754941e-06, "loss": 0.3197, "step": 20580 }, { "epoch": 3.875399962356484, "grad_norm": 12.150304794311523, "learning_rate": 4.498400150574064e-06, "loss": 0.616, "step": 20590 }, { "epoch": 3.8772821381517035, "grad_norm": 13.6411771774292, "learning_rate": 4.490871447393187e-06, "loss": 0.5819, "step": 20600 }, { "epoch": 3.8791643139469225, "grad_norm": 18.936782836914062, "learning_rate": 4.48334274421231e-06, "loss": 0.7718, "step": 20610 }, { "epoch": 3.881046489742142, "grad_norm": 29.339204788208008, "learning_rate": 4.4758140410314325e-06, "loss": 0.6563, "step": 20620 }, { "epoch": 3.882928665537361, "grad_norm": 19.04207420349121, "learning_rate": 4.4682853378505555e-06, "loss": 0.8119, "step": 20630 }, { "epoch": 3.8848108413325804, "grad_norm": 11.899054527282715, "learning_rate": 4.460756634669678e-06, "loss": 0.47, "step": 20640 }, { "epoch": 3.8866930171278, "grad_norm": 5.735147953033447, "learning_rate": 4.453227931488801e-06, "loss": 0.4137, "step": 20650 }, { "epoch": 3.8885751929230192, "grad_norm": 7.638981342315674, "learning_rate": 4.445699228307924e-06, "loss": 0.5822, "step": 20660 }, { "epoch": 3.890457368718238, "grad_norm": 22.939064025878906, "learning_rate": 4.438170525127047e-06, "loss": 0.8084, "step": 20670 }, { "epoch": 3.8923395445134576, "grad_norm": 8.393478393554688, "learning_rate": 4.43064182194617e-06, "loss": 0.5258, "step": 20680 }, { "epoch": 3.8942217203086766, "grad_norm": 17.132465362548828, "learning_rate": 4.423113118765293e-06, "loss": 0.5562, "step": 20690 }, { "epoch": 3.896103896103896, "grad_norm": 28.829774856567383, "learning_rate": 4.415584415584416e-06, "loss": 0.5178, "step": 20700 }, { "epoch": 3.8979860718991155, "grad_norm": 20.98097038269043, "learning_rate": 4.408055712403539e-06, "loss": 0.7207, "step": 20710 }, { "epoch": 3.8998682476943345, "grad_norm": 18.7142333984375, "learning_rate": 4.400527009222662e-06, "loss": 0.4549, "step": 20720 }, { "epoch": 3.901750423489554, "grad_norm": 6.946071147918701, "learning_rate": 4.392998306041785e-06, "loss": 0.4556, "step": 20730 }, { "epoch": 3.9036325992847734, "grad_norm": 21.66033363342285, "learning_rate": 4.385469602860908e-06, "loss": 0.8115, "step": 20740 }, { "epoch": 3.9055147750799923, "grad_norm": 9.17212963104248, "learning_rate": 4.3779408996800305e-06, "loss": 0.5632, "step": 20750 }, { "epoch": 3.9073969508752118, "grad_norm": 1.5829265117645264, "learning_rate": 4.3704121964991534e-06, "loss": 0.4644, "step": 20760 }, { "epoch": 3.909279126670431, "grad_norm": 26.870147705078125, "learning_rate": 4.362883493318276e-06, "loss": 0.4721, "step": 20770 }, { "epoch": 3.91116130246565, "grad_norm": 14.611478805541992, "learning_rate": 4.355354790137399e-06, "loss": 0.4564, "step": 20780 }, { "epoch": 3.9130434782608696, "grad_norm": 24.822168350219727, "learning_rate": 4.347826086956522e-06, "loss": 1.0165, "step": 20790 }, { "epoch": 3.9149256540560886, "grad_norm": 10.384610176086426, "learning_rate": 4.340297383775645e-06, "loss": 0.5887, "step": 20800 }, { "epoch": 3.916807829851308, "grad_norm": 6.775899887084961, "learning_rate": 4.332768680594768e-06, "loss": 0.4609, "step": 20810 }, { "epoch": 3.9186900056465275, "grad_norm": 22.89283561706543, "learning_rate": 4.325239977413891e-06, "loss": 0.8258, "step": 20820 }, { "epoch": 3.920572181441747, "grad_norm": 17.177522659301758, "learning_rate": 4.317711274233014e-06, "loss": 0.5948, "step": 20830 }, { "epoch": 3.922454357236966, "grad_norm": 9.44758415222168, "learning_rate": 4.310182571052137e-06, "loss": 0.7599, "step": 20840 }, { "epoch": 3.9243365330321853, "grad_norm": 9.103228569030762, "learning_rate": 4.302653867871259e-06, "loss": 0.5208, "step": 20850 }, { "epoch": 3.9262187088274043, "grad_norm": 15.446986198425293, "learning_rate": 4.295125164690383e-06, "loss": 0.4422, "step": 20860 }, { "epoch": 3.9281008846226237, "grad_norm": 15.556666374206543, "learning_rate": 4.287596461509506e-06, "loss": 0.5821, "step": 20870 }, { "epoch": 3.929983060417843, "grad_norm": 25.277069091796875, "learning_rate": 4.2800677583286285e-06, "loss": 0.6588, "step": 20880 }, { "epoch": 3.931865236213062, "grad_norm": 18.088552474975586, "learning_rate": 4.272539055147751e-06, "loss": 0.5822, "step": 20890 }, { "epoch": 3.9337474120082816, "grad_norm": 3.4164326190948486, "learning_rate": 4.265010351966874e-06, "loss": 0.2988, "step": 20900 }, { "epoch": 3.9356295878035006, "grad_norm": 24.451601028442383, "learning_rate": 4.257481648785997e-06, "loss": 0.5764, "step": 20910 }, { "epoch": 3.93751176359872, "grad_norm": 10.956971168518066, "learning_rate": 4.249952945605119e-06, "loss": 0.7125, "step": 20920 }, { "epoch": 3.9393939393939394, "grad_norm": 11.39134407043457, "learning_rate": 4.242424242424243e-06, "loss": 0.7606, "step": 20930 }, { "epoch": 3.941276115189159, "grad_norm": 7.814131736755371, "learning_rate": 4.234895539243365e-06, "loss": 0.4197, "step": 20940 }, { "epoch": 3.943158290984378, "grad_norm": 12.360709190368652, "learning_rate": 4.227366836062489e-06, "loss": 0.6645, "step": 20950 }, { "epoch": 3.9450404667795973, "grad_norm": 2.4350345134735107, "learning_rate": 4.219838132881611e-06, "loss": 0.6532, "step": 20960 }, { "epoch": 3.9469226425748163, "grad_norm": 0.28753194212913513, "learning_rate": 4.212309429700735e-06, "loss": 0.9288, "step": 20970 }, { "epoch": 3.9488048183700357, "grad_norm": 19.667470932006836, "learning_rate": 4.204780726519857e-06, "loss": 1.0559, "step": 20980 }, { "epoch": 3.950686994165255, "grad_norm": 4.4203081130981445, "learning_rate": 4.19725202333898e-06, "loss": 0.7958, "step": 20990 }, { "epoch": 3.9525691699604746, "grad_norm": 12.80313491821289, "learning_rate": 4.1897233201581036e-06, "loss": 0.4913, "step": 21000 }, { "epoch": 3.9544513457556936, "grad_norm": 27.15594482421875, "learning_rate": 4.182194616977226e-06, "loss": 0.5069, "step": 21010 }, { "epoch": 3.956333521550913, "grad_norm": 18.540098190307617, "learning_rate": 4.174665913796349e-06, "loss": 0.6035, "step": 21020 }, { "epoch": 3.958215697346132, "grad_norm": 22.570690155029297, "learning_rate": 4.1671372106154715e-06, "loss": 0.675, "step": 21030 }, { "epoch": 3.9600978731413514, "grad_norm": 23.07613182067871, "learning_rate": 4.159608507434595e-06, "loss": 0.5909, "step": 21040 }, { "epoch": 3.961980048936571, "grad_norm": 12.728482246398926, "learning_rate": 4.152079804253717e-06, "loss": 0.7034, "step": 21050 }, { "epoch": 3.96386222473179, "grad_norm": 30.579265594482422, "learning_rate": 4.14455110107284e-06, "loss": 0.5167, "step": 21060 }, { "epoch": 3.9657444005270093, "grad_norm": 17.927875518798828, "learning_rate": 4.137022397891963e-06, "loss": 0.6327, "step": 21070 }, { "epoch": 3.9676265763222283, "grad_norm": 14.641374588012695, "learning_rate": 4.129493694711086e-06, "loss": 0.7122, "step": 21080 }, { "epoch": 3.9695087521174477, "grad_norm": 16.535053253173828, "learning_rate": 4.121964991530209e-06, "loss": 0.8122, "step": 21090 }, { "epoch": 3.971390927912667, "grad_norm": 21.376726150512695, "learning_rate": 4.114436288349332e-06, "loss": 0.5726, "step": 21100 }, { "epoch": 3.9732731037078866, "grad_norm": 14.665657997131348, "learning_rate": 4.106907585168456e-06, "loss": 0.7047, "step": 21110 }, { "epoch": 3.9751552795031055, "grad_norm": 0.8747822046279907, "learning_rate": 4.099378881987578e-06, "loss": 0.5733, "step": 21120 }, { "epoch": 3.977037455298325, "grad_norm": 7.762773513793945, "learning_rate": 4.091850178806701e-06, "loss": 0.7884, "step": 21130 }, { "epoch": 3.978919631093544, "grad_norm": 15.392909049987793, "learning_rate": 4.084321475625824e-06, "loss": 0.6718, "step": 21140 }, { "epoch": 3.9808018068887634, "grad_norm": 34.64730453491211, "learning_rate": 4.0767927724449465e-06, "loss": 0.9371, "step": 21150 }, { "epoch": 3.982683982683983, "grad_norm": 5.796759605407715, "learning_rate": 4.0692640692640695e-06, "loss": 0.7274, "step": 21160 }, { "epoch": 3.984566158479202, "grad_norm": 17.814022064208984, "learning_rate": 4.061735366083192e-06, "loss": 0.6687, "step": 21170 }, { "epoch": 3.9864483342744212, "grad_norm": 9.838661193847656, "learning_rate": 4.054206662902315e-06, "loss": 0.8333, "step": 21180 }, { "epoch": 3.9883305100696402, "grad_norm": 9.50422191619873, "learning_rate": 4.046677959721438e-06, "loss": 0.498, "step": 21190 }, { "epoch": 3.9902126858648597, "grad_norm": 24.345027923583984, "learning_rate": 4.039149256540561e-06, "loss": 0.5806, "step": 21200 }, { "epoch": 3.992094861660079, "grad_norm": 17.970151901245117, "learning_rate": 4.031620553359684e-06, "loss": 0.4809, "step": 21210 }, { "epoch": 3.9939770374552985, "grad_norm": 6.761532783508301, "learning_rate": 4.024091850178807e-06, "loss": 0.546, "step": 21220 }, { "epoch": 3.9958592132505175, "grad_norm": 25.05278778076172, "learning_rate": 4.01656314699793e-06, "loss": 0.5847, "step": 21230 }, { "epoch": 3.997741389045737, "grad_norm": 25.217716217041016, "learning_rate": 4.009034443817053e-06, "loss": 0.7152, "step": 21240 }, { "epoch": 3.999623564840956, "grad_norm": 13.365795135498047, "learning_rate": 4.001505740636176e-06, "loss": 0.3327, "step": 21250 }, { "epoch": 4.0, "eval_accuracy": 0.9198666666666667, "eval_loss": 0.3090076744556427, "eval_runtime": 123.9046, "eval_samples_per_second": 60.53, "eval_steps_per_second": 7.57, "step": 21252 }, { "epoch": 4.001505740636175, "grad_norm": 3.5540566444396973, "learning_rate": 3.993977037455299e-06, "loss": 0.726, "step": 21260 }, { "epoch": 4.003387916431395, "grad_norm": 33.1975212097168, "learning_rate": 3.986448334274422e-06, "loss": 0.4819, "step": 21270 }, { "epoch": 4.005270092226614, "grad_norm": 6.789219379425049, "learning_rate": 3.9789196310935445e-06, "loss": 0.3697, "step": 21280 }, { "epoch": 4.007152268021834, "grad_norm": 20.950092315673828, "learning_rate": 3.9713909279126674e-06, "loss": 0.6637, "step": 21290 }, { "epoch": 4.009034443817052, "grad_norm": 7.9257707595825195, "learning_rate": 3.96386222473179e-06, "loss": 0.645, "step": 21300 }, { "epoch": 4.010916619612272, "grad_norm": 2.989037275314331, "learning_rate": 3.956333521550913e-06, "loss": 0.7831, "step": 21310 }, { "epoch": 4.012798795407491, "grad_norm": 23.511072158813477, "learning_rate": 3.948804818370036e-06, "loss": 0.4667, "step": 21320 }, { "epoch": 4.0146809712027105, "grad_norm": 20.884122848510742, "learning_rate": 3.941276115189159e-06, "loss": 0.8412, "step": 21330 }, { "epoch": 4.01656314699793, "grad_norm": 31.798341751098633, "learning_rate": 3.933747412008282e-06, "loss": 0.7123, "step": 21340 }, { "epoch": 4.0184453227931485, "grad_norm": 10.063611030578613, "learning_rate": 3.926218708827404e-06, "loss": 0.5005, "step": 21350 }, { "epoch": 4.020327498588368, "grad_norm": 13.424291610717773, "learning_rate": 3.918690005646528e-06, "loss": 0.5276, "step": 21360 }, { "epoch": 4.022209674383587, "grad_norm": 2.927164316177368, "learning_rate": 3.911161302465651e-06, "loss": 0.4943, "step": 21370 }, { "epoch": 4.024091850178807, "grad_norm": 21.97260856628418, "learning_rate": 3.903632599284774e-06, "loss": 0.773, "step": 21380 }, { "epoch": 4.025974025974026, "grad_norm": 14.4227876663208, "learning_rate": 3.896103896103897e-06, "loss": 0.8997, "step": 21390 }, { "epoch": 4.027856201769246, "grad_norm": 19.949077606201172, "learning_rate": 3.8885751929230196e-06, "loss": 0.4273, "step": 21400 }, { "epoch": 4.029738377564464, "grad_norm": 12.47314167022705, "learning_rate": 3.8810464897421425e-06, "loss": 0.8176, "step": 21410 }, { "epoch": 4.031620553359684, "grad_norm": 8.580794334411621, "learning_rate": 3.8735177865612646e-06, "loss": 0.7012, "step": 21420 }, { "epoch": 4.033502729154903, "grad_norm": 0.19655872881412506, "learning_rate": 3.865989083380388e-06, "loss": 0.6125, "step": 21430 }, { "epoch": 4.0353849049501225, "grad_norm": 39.343509674072266, "learning_rate": 3.85846038019951e-06, "loss": 0.3181, "step": 21440 }, { "epoch": 4.037267080745342, "grad_norm": 24.081876754760742, "learning_rate": 3.850931677018634e-06, "loss": 0.3806, "step": 21450 }, { "epoch": 4.0391492565405605, "grad_norm": 10.21368408203125, "learning_rate": 3.843402973837756e-06, "loss": 0.5367, "step": 21460 }, { "epoch": 4.04103143233578, "grad_norm": 3.1306040287017822, "learning_rate": 3.83587427065688e-06, "loss": 0.5962, "step": 21470 }, { "epoch": 4.042913608130999, "grad_norm": 0.1672428995370865, "learning_rate": 3.828345567476003e-06, "loss": 0.3698, "step": 21480 }, { "epoch": 4.044795783926219, "grad_norm": 22.689733505249023, "learning_rate": 3.820816864295126e-06, "loss": 0.4011, "step": 21490 }, { "epoch": 4.046677959721438, "grad_norm": 21.669288635253906, "learning_rate": 3.8132881611142488e-06, "loss": 0.945, "step": 21500 }, { "epoch": 4.048560135516658, "grad_norm": 27.549644470214844, "learning_rate": 3.8057594579333713e-06, "loss": 0.4268, "step": 21510 }, { "epoch": 4.050442311311876, "grad_norm": 1.7836955785751343, "learning_rate": 3.798230754752494e-06, "loss": 0.2654, "step": 21520 }, { "epoch": 4.052324487107096, "grad_norm": 21.68084716796875, "learning_rate": 3.790702051571617e-06, "loss": 0.4231, "step": 21530 }, { "epoch": 4.054206662902315, "grad_norm": 15.864786148071289, "learning_rate": 3.78317334839074e-06, "loss": 0.4331, "step": 21540 }, { "epoch": 4.0560888386975344, "grad_norm": 8.979389190673828, "learning_rate": 3.7756446452098625e-06, "loss": 0.8534, "step": 21550 }, { "epoch": 4.057971014492754, "grad_norm": 13.514737129211426, "learning_rate": 3.768115942028986e-06, "loss": 0.5054, "step": 21560 }, { "epoch": 4.059853190287973, "grad_norm": 6.7341766357421875, "learning_rate": 3.7605872388481084e-06, "loss": 0.495, "step": 21570 }, { "epoch": 4.061735366083192, "grad_norm": 8.057145118713379, "learning_rate": 3.7530585356672317e-06, "loss": 0.4183, "step": 21580 }, { "epoch": 4.063617541878411, "grad_norm": 17.93224334716797, "learning_rate": 3.7455298324863542e-06, "loss": 0.4697, "step": 21590 }, { "epoch": 4.065499717673631, "grad_norm": 12.62182331085205, "learning_rate": 3.7380011293054776e-06, "loss": 0.2779, "step": 21600 }, { "epoch": 4.06738189346885, "grad_norm": 1.0005717277526855, "learning_rate": 3.7304724261246005e-06, "loss": 0.6458, "step": 21610 }, { "epoch": 4.06926406926407, "grad_norm": 9.269535064697266, "learning_rate": 3.722943722943723e-06, "loss": 0.4826, "step": 21620 }, { "epoch": 4.071146245059288, "grad_norm": 0.3140579164028168, "learning_rate": 3.7154150197628463e-06, "loss": 0.6507, "step": 21630 }, { "epoch": 4.073028420854508, "grad_norm": 20.74129295349121, "learning_rate": 3.707886316581969e-06, "loss": 0.4982, "step": 21640 }, { "epoch": 4.074910596649727, "grad_norm": 2.1303975582122803, "learning_rate": 3.700357613401092e-06, "loss": 0.5702, "step": 21650 }, { "epoch": 4.076792772444946, "grad_norm": 0.7507879734039307, "learning_rate": 3.6928289102202147e-06, "loss": 0.4595, "step": 21660 }, { "epoch": 4.078674948240166, "grad_norm": 12.625158309936523, "learning_rate": 3.685300207039338e-06, "loss": 0.3126, "step": 21670 }, { "epoch": 4.080557124035385, "grad_norm": 4.407412052154541, "learning_rate": 3.6777715038584605e-06, "loss": 0.4585, "step": 21680 }, { "epoch": 4.082439299830604, "grad_norm": 33.84129333496094, "learning_rate": 3.670242800677584e-06, "loss": 0.7722, "step": 21690 }, { "epoch": 4.084321475625823, "grad_norm": 9.634953498840332, "learning_rate": 3.6627140974967064e-06, "loss": 0.5845, "step": 21700 }, { "epoch": 4.086203651421043, "grad_norm": 32.898746490478516, "learning_rate": 3.6551853943158293e-06, "loss": 0.4641, "step": 21710 }, { "epoch": 4.088085827216262, "grad_norm": 10.510771751403809, "learning_rate": 3.6476566911349526e-06, "loss": 0.7289, "step": 21720 }, { "epoch": 4.089968003011482, "grad_norm": 24.86658477783203, "learning_rate": 3.640127987954075e-06, "loss": 0.6348, "step": 21730 }, { "epoch": 4.0918501788067, "grad_norm": 14.314352989196777, "learning_rate": 3.6325992847731985e-06, "loss": 0.6093, "step": 21740 }, { "epoch": 4.0937323546019195, "grad_norm": 27.23653793334961, "learning_rate": 3.625070581592321e-06, "loss": 0.5205, "step": 21750 }, { "epoch": 4.095614530397139, "grad_norm": 19.099502563476562, "learning_rate": 3.6175418784114443e-06, "loss": 0.6852, "step": 21760 }, { "epoch": 4.097496706192358, "grad_norm": 20.99348258972168, "learning_rate": 3.610013175230567e-06, "loss": 0.6112, "step": 21770 }, { "epoch": 4.099378881987578, "grad_norm": 8.206368446350098, "learning_rate": 3.6024844720496897e-06, "loss": 0.688, "step": 21780 }, { "epoch": 4.101261057782797, "grad_norm": 7.729517459869385, "learning_rate": 3.5949557688688127e-06, "loss": 0.3646, "step": 21790 }, { "epoch": 4.103143233578016, "grad_norm": 142.78952026367188, "learning_rate": 3.5874270656879356e-06, "loss": 0.6408, "step": 21800 }, { "epoch": 4.105025409373235, "grad_norm": 7.343946933746338, "learning_rate": 3.579898362507058e-06, "loss": 0.6473, "step": 21810 }, { "epoch": 4.106907585168455, "grad_norm": 9.92300796508789, "learning_rate": 3.5723696593261814e-06, "loss": 0.436, "step": 21820 }, { "epoch": 4.108789760963674, "grad_norm": 22.44314193725586, "learning_rate": 3.564840956145304e-06, "loss": 0.6023, "step": 21830 }, { "epoch": 4.1106719367588935, "grad_norm": 14.589250564575195, "learning_rate": 3.5573122529644273e-06, "loss": 0.7282, "step": 21840 }, { "epoch": 4.112554112554113, "grad_norm": 9.727843284606934, "learning_rate": 3.54978354978355e-06, "loss": 0.3819, "step": 21850 }, { "epoch": 4.1144362883493315, "grad_norm": 6.11102819442749, "learning_rate": 3.542254846602673e-06, "loss": 0.6332, "step": 21860 }, { "epoch": 4.116318464144551, "grad_norm": 15.100151062011719, "learning_rate": 3.534726143421796e-06, "loss": 0.4667, "step": 21870 }, { "epoch": 4.11820063993977, "grad_norm": 22.583946228027344, "learning_rate": 3.5271974402409185e-06, "loss": 0.484, "step": 21880 }, { "epoch": 4.12008281573499, "grad_norm": 7.776749610900879, "learning_rate": 3.519668737060042e-06, "loss": 0.7223, "step": 21890 }, { "epoch": 4.121964991530209, "grad_norm": 17.078176498413086, "learning_rate": 3.5121400338791644e-06, "loss": 0.2882, "step": 21900 }, { "epoch": 4.123847167325428, "grad_norm": 20.212486267089844, "learning_rate": 3.5046113306982877e-06, "loss": 0.4445, "step": 21910 }, { "epoch": 4.125729343120647, "grad_norm": 1.8471107482910156, "learning_rate": 3.49708262751741e-06, "loss": 0.4017, "step": 21920 }, { "epoch": 4.127611518915867, "grad_norm": 7.003720283508301, "learning_rate": 3.4895539243365336e-06, "loss": 0.582, "step": 21930 }, { "epoch": 4.129493694711086, "grad_norm": 29.45206069946289, "learning_rate": 3.482025221155656e-06, "loss": 0.6031, "step": 21940 }, { "epoch": 4.1313758705063055, "grad_norm": 14.213915824890137, "learning_rate": 3.474496517974779e-06, "loss": 0.8854, "step": 21950 }, { "epoch": 4.133258046301525, "grad_norm": 9.18936538696289, "learning_rate": 3.466967814793902e-06, "loss": 0.4027, "step": 21960 }, { "epoch": 4.1351402220967435, "grad_norm": 2.583569049835205, "learning_rate": 3.459439111613025e-06, "loss": 0.6699, "step": 21970 }, { "epoch": 4.137022397891963, "grad_norm": 9.588484764099121, "learning_rate": 3.451910408432148e-06, "loss": 0.7421, "step": 21980 }, { "epoch": 4.138904573687182, "grad_norm": 9.309282302856445, "learning_rate": 3.4443817052512707e-06, "loss": 0.4973, "step": 21990 }, { "epoch": 4.140786749482402, "grad_norm": 18.542089462280273, "learning_rate": 3.436853002070394e-06, "loss": 0.6043, "step": 22000 }, { "epoch": 4.142668925277621, "grad_norm": 24.662412643432617, "learning_rate": 3.4293242988895165e-06, "loss": 0.6221, "step": 22010 }, { "epoch": 4.144551101072841, "grad_norm": 20.624713897705078, "learning_rate": 3.4217955957086394e-06, "loss": 0.7082, "step": 22020 }, { "epoch": 4.146433276868059, "grad_norm": 5.393800258636475, "learning_rate": 3.4142668925277623e-06, "loss": 0.7466, "step": 22030 }, { "epoch": 4.148315452663279, "grad_norm": 8.6702880859375, "learning_rate": 3.4067381893468853e-06, "loss": 0.4577, "step": 22040 }, { "epoch": 4.150197628458498, "grad_norm": 8.493791580200195, "learning_rate": 3.399209486166008e-06, "loss": 0.5303, "step": 22050 }, { "epoch": 4.1520798042537175, "grad_norm": 3.720416784286499, "learning_rate": 3.391680782985131e-06, "loss": 0.8215, "step": 22060 }, { "epoch": 4.153961980048937, "grad_norm": 3.568179130554199, "learning_rate": 3.3841520798042536e-06, "loss": 0.5173, "step": 22070 }, { "epoch": 4.1558441558441555, "grad_norm": 11.129983901977539, "learning_rate": 3.376623376623377e-06, "loss": 0.6122, "step": 22080 }, { "epoch": 4.157726331639375, "grad_norm": 13.840152740478516, "learning_rate": 3.3690946734425e-06, "loss": 0.624, "step": 22090 }, { "epoch": 4.159608507434594, "grad_norm": 28.01687240600586, "learning_rate": 3.361565970261623e-06, "loss": 0.6082, "step": 22100 }, { "epoch": 4.161490683229814, "grad_norm": 18.05573081970215, "learning_rate": 3.3540372670807457e-06, "loss": 0.319, "step": 22110 }, { "epoch": 4.163372859025033, "grad_norm": 27.826292037963867, "learning_rate": 3.3465085638998686e-06, "loss": 0.6604, "step": 22120 }, { "epoch": 4.165255034820253, "grad_norm": 25.494606018066406, "learning_rate": 3.3389798607189916e-06, "loss": 0.5413, "step": 22130 }, { "epoch": 4.167137210615471, "grad_norm": 34.23874282836914, "learning_rate": 3.331451157538114e-06, "loss": 0.7515, "step": 22140 }, { "epoch": 4.169019386410691, "grad_norm": 20.008041381835938, "learning_rate": 3.3239224543572374e-06, "loss": 0.7427, "step": 22150 }, { "epoch": 4.17090156220591, "grad_norm": 21.009740829467773, "learning_rate": 3.31639375117636e-06, "loss": 0.3218, "step": 22160 }, { "epoch": 4.1727837380011294, "grad_norm": 31.26310920715332, "learning_rate": 3.3088650479954832e-06, "loss": 0.593, "step": 22170 }, { "epoch": 4.174665913796349, "grad_norm": 0.45123204588890076, "learning_rate": 3.3013363448146057e-06, "loss": 0.4274, "step": 22180 }, { "epoch": 4.176548089591567, "grad_norm": 16.512317657470703, "learning_rate": 3.293807641633729e-06, "loss": 0.7006, "step": 22190 }, { "epoch": 4.178430265386787, "grad_norm": 7.592436790466309, "learning_rate": 3.2862789384528516e-06, "loss": 0.5583, "step": 22200 }, { "epoch": 4.180312441182006, "grad_norm": 19.937170028686523, "learning_rate": 3.2787502352719745e-06, "loss": 0.8125, "step": 22210 }, { "epoch": 4.182194616977226, "grad_norm": 24.68047332763672, "learning_rate": 3.271221532091098e-06, "loss": 0.6438, "step": 22220 }, { "epoch": 4.184076792772445, "grad_norm": 16.7028865814209, "learning_rate": 3.2636928289102203e-06, "loss": 0.3172, "step": 22230 }, { "epoch": 4.185958968567665, "grad_norm": 31.273239135742188, "learning_rate": 3.2561641257293437e-06, "loss": 0.413, "step": 22240 }, { "epoch": 4.187841144362883, "grad_norm": 7.516437530517578, "learning_rate": 3.248635422548466e-06, "loss": 0.5438, "step": 22250 }, { "epoch": 4.189723320158103, "grad_norm": 12.695963859558105, "learning_rate": 3.2411067193675895e-06, "loss": 0.7824, "step": 22260 }, { "epoch": 4.191605495953322, "grad_norm": 0.8344699144363403, "learning_rate": 3.233578016186712e-06, "loss": 0.5172, "step": 22270 }, { "epoch": 4.193487671748541, "grad_norm": 7.483529090881348, "learning_rate": 3.226049313005835e-06, "loss": 0.5373, "step": 22280 }, { "epoch": 4.195369847543761, "grad_norm": 0.5889711380004883, "learning_rate": 3.218520609824958e-06, "loss": 0.5973, "step": 22290 }, { "epoch": 4.19725202333898, "grad_norm": 13.00317096710205, "learning_rate": 3.210991906644081e-06, "loss": 0.5395, "step": 22300 }, { "epoch": 4.199134199134199, "grad_norm": 22.1431827545166, "learning_rate": 3.2034632034632033e-06, "loss": 0.7073, "step": 22310 }, { "epoch": 4.201016374929418, "grad_norm": 24.43523406982422, "learning_rate": 3.1959345002823266e-06, "loss": 0.9535, "step": 22320 }, { "epoch": 4.202898550724638, "grad_norm": 0.73313969373703, "learning_rate": 3.188405797101449e-06, "loss": 0.6103, "step": 22330 }, { "epoch": 4.204780726519857, "grad_norm": 17.1301212310791, "learning_rate": 3.1808770939205725e-06, "loss": 0.5049, "step": 22340 }, { "epoch": 4.206662902315077, "grad_norm": 19.849651336669922, "learning_rate": 3.1733483907396954e-06, "loss": 0.8419, "step": 22350 }, { "epoch": 4.208545078110295, "grad_norm": 7.085902690887451, "learning_rate": 3.1658196875588183e-06, "loss": 0.7864, "step": 22360 }, { "epoch": 4.2104272539055145, "grad_norm": 0.4933435916900635, "learning_rate": 3.1582909843779412e-06, "loss": 0.5251, "step": 22370 }, { "epoch": 4.212309429700734, "grad_norm": 16.553647994995117, "learning_rate": 3.1507622811970637e-06, "loss": 0.6869, "step": 22380 }, { "epoch": 4.214191605495953, "grad_norm": 5.6313018798828125, "learning_rate": 3.143233578016187e-06, "loss": 0.4803, "step": 22390 }, { "epoch": 4.216073781291173, "grad_norm": 20.206897735595703, "learning_rate": 3.1357048748353096e-06, "loss": 0.6774, "step": 22400 }, { "epoch": 4.217955957086392, "grad_norm": 12.4583158493042, "learning_rate": 3.128176171654433e-06, "loss": 0.6524, "step": 22410 }, { "epoch": 4.219838132881611, "grad_norm": 11.112521171569824, "learning_rate": 3.1206474684735554e-06, "loss": 0.5888, "step": 22420 }, { "epoch": 4.22172030867683, "grad_norm": 25.20332145690918, "learning_rate": 3.1131187652926788e-06, "loss": 0.4639, "step": 22430 }, { "epoch": 4.22360248447205, "grad_norm": 13.431328773498535, "learning_rate": 3.1055900621118013e-06, "loss": 0.6233, "step": 22440 }, { "epoch": 4.225484660267269, "grad_norm": 25.92032241821289, "learning_rate": 3.0980613589309246e-06, "loss": 1.0161, "step": 22450 }, { "epoch": 4.2273668360624885, "grad_norm": 8.30104923248291, "learning_rate": 3.0905326557500475e-06, "loss": 0.7447, "step": 22460 }, { "epoch": 4.229249011857707, "grad_norm": 2.044919967651367, "learning_rate": 3.08300395256917e-06, "loss": 0.5459, "step": 22470 }, { "epoch": 4.2311311876529265, "grad_norm": 6.667474746704102, "learning_rate": 3.0754752493882934e-06, "loss": 0.8596, "step": 22480 }, { "epoch": 4.233013363448146, "grad_norm": 4.499909400939941, "learning_rate": 3.067946546207416e-06, "loss": 0.3851, "step": 22490 }, { "epoch": 4.234895539243365, "grad_norm": 12.6906099319458, "learning_rate": 3.0604178430265392e-06, "loss": 0.5335, "step": 22500 }, { "epoch": 4.236777715038585, "grad_norm": 6.795680046081543, "learning_rate": 3.0528891398456617e-06, "loss": 0.4769, "step": 22510 }, { "epoch": 4.238659890833804, "grad_norm": 22.774063110351562, "learning_rate": 3.045360436664785e-06, "loss": 0.453, "step": 22520 }, { "epoch": 4.240542066629023, "grad_norm": 14.406441688537598, "learning_rate": 3.0378317334839076e-06, "loss": 0.67, "step": 22530 }, { "epoch": 4.242424242424242, "grad_norm": 13.296455383300781, "learning_rate": 3.0303030303030305e-06, "loss": 0.4151, "step": 22540 }, { "epoch": 4.244306418219462, "grad_norm": 15.775333404541016, "learning_rate": 3.0227743271221534e-06, "loss": 0.5242, "step": 22550 }, { "epoch": 4.246188594014681, "grad_norm": 14.018509864807129, "learning_rate": 3.0152456239412763e-06, "loss": 0.3969, "step": 22560 }, { "epoch": 4.2480707698099005, "grad_norm": 13.857187271118164, "learning_rate": 3.007716920760399e-06, "loss": 0.5807, "step": 22570 }, { "epoch": 4.24995294560512, "grad_norm": 4.365860462188721, "learning_rate": 3.000188217579522e-06, "loss": 0.7116, "step": 22580 }, { "epoch": 4.2518351214003385, "grad_norm": 1.263258934020996, "learning_rate": 2.9926595143986455e-06, "loss": 0.4453, "step": 22590 }, { "epoch": 4.253717297195558, "grad_norm": 11.76640796661377, "learning_rate": 2.985130811217768e-06, "loss": 0.6742, "step": 22600 }, { "epoch": 4.255599472990777, "grad_norm": 30.159893035888672, "learning_rate": 2.977602108036891e-06, "loss": 0.7244, "step": 22610 }, { "epoch": 4.257481648785997, "grad_norm": 14.70153522491455, "learning_rate": 2.970073404856014e-06, "loss": 0.4377, "step": 22620 }, { "epoch": 4.259363824581216, "grad_norm": 0.953132688999176, "learning_rate": 2.9625447016751368e-06, "loss": 0.7653, "step": 22630 }, { "epoch": 4.261246000376435, "grad_norm": 32.616031646728516, "learning_rate": 2.9550159984942593e-06, "loss": 0.8886, "step": 22640 }, { "epoch": 4.263128176171654, "grad_norm": 16.00680160522461, "learning_rate": 2.9474872953133826e-06, "loss": 0.6501, "step": 22650 }, { "epoch": 4.265010351966874, "grad_norm": 18.027944564819336, "learning_rate": 2.939958592132505e-06, "loss": 0.748, "step": 22660 }, { "epoch": 4.266892527762093, "grad_norm": 15.885736465454102, "learning_rate": 2.9324298889516285e-06, "loss": 0.5954, "step": 22670 }, { "epoch": 4.2687747035573125, "grad_norm": 15.595780372619629, "learning_rate": 2.924901185770751e-06, "loss": 0.4439, "step": 22680 }, { "epoch": 4.270656879352532, "grad_norm": 14.558534622192383, "learning_rate": 2.9173724825898743e-06, "loss": 0.5055, "step": 22690 }, { "epoch": 4.2725390551477505, "grad_norm": 1.8835563659667969, "learning_rate": 2.9098437794089972e-06, "loss": 0.8219, "step": 22700 }, { "epoch": 4.27442123094297, "grad_norm": 11.67325210571289, "learning_rate": 2.9023150762281197e-06, "loss": 0.4953, "step": 22710 }, { "epoch": 4.276303406738189, "grad_norm": 5.809265613555908, "learning_rate": 2.894786373047243e-06, "loss": 0.5423, "step": 22720 }, { "epoch": 4.278185582533409, "grad_norm": 13.89829158782959, "learning_rate": 2.8872576698663656e-06, "loss": 0.6693, "step": 22730 }, { "epoch": 4.280067758328628, "grad_norm": 8.091310501098633, "learning_rate": 2.879728966685489e-06, "loss": 0.6786, "step": 22740 }, { "epoch": 4.281949934123848, "grad_norm": 18.016300201416016, "learning_rate": 2.8722002635046114e-06, "loss": 0.4909, "step": 22750 }, { "epoch": 4.283832109919066, "grad_norm": 0.6892550587654114, "learning_rate": 2.8646715603237348e-06, "loss": 0.5059, "step": 22760 }, { "epoch": 4.285714285714286, "grad_norm": 0.654849648475647, "learning_rate": 2.8571428571428573e-06, "loss": 0.3984, "step": 22770 }, { "epoch": 4.287596461509505, "grad_norm": 19.24492835998535, "learning_rate": 2.84961415396198e-06, "loss": 0.4691, "step": 22780 }, { "epoch": 4.2894786373047245, "grad_norm": 20.14637565612793, "learning_rate": 2.842085450781103e-06, "loss": 0.5974, "step": 22790 }, { "epoch": 4.291360813099944, "grad_norm": 12.756900787353516, "learning_rate": 2.834556747600226e-06, "loss": 0.4315, "step": 22800 }, { "epoch": 4.293242988895162, "grad_norm": 14.236254692077637, "learning_rate": 2.827028044419349e-06, "loss": 0.7167, "step": 22810 }, { "epoch": 4.295125164690382, "grad_norm": 17.23895263671875, "learning_rate": 2.819499341238472e-06, "loss": 0.6919, "step": 22820 }, { "epoch": 4.297007340485601, "grad_norm": 28.651317596435547, "learning_rate": 2.811970638057595e-06, "loss": 0.5164, "step": 22830 }, { "epoch": 4.298889516280821, "grad_norm": 1.3199594020843506, "learning_rate": 2.8044419348767177e-06, "loss": 0.5391, "step": 22840 }, { "epoch": 4.30077169207604, "grad_norm": 7.875750541687012, "learning_rate": 2.7969132316958406e-06, "loss": 0.5423, "step": 22850 }, { "epoch": 4.30265386787126, "grad_norm": 7.99800968170166, "learning_rate": 2.7893845285149635e-06, "loss": 0.67, "step": 22860 }, { "epoch": 4.304536043666478, "grad_norm": 28.95355224609375, "learning_rate": 2.7818558253340865e-06, "loss": 0.692, "step": 22870 }, { "epoch": 4.306418219461698, "grad_norm": 37.334197998046875, "learning_rate": 2.7743271221532094e-06, "loss": 0.485, "step": 22880 }, { "epoch": 4.308300395256917, "grad_norm": 9.309508323669434, "learning_rate": 2.7667984189723323e-06, "loss": 0.7451, "step": 22890 }, { "epoch": 4.310182571052136, "grad_norm": 21.458972930908203, "learning_rate": 2.759269715791455e-06, "loss": 0.4967, "step": 22900 }, { "epoch": 4.312064746847356, "grad_norm": 13.24349308013916, "learning_rate": 2.751741012610578e-06, "loss": 0.5979, "step": 22910 }, { "epoch": 4.313946922642575, "grad_norm": 43.8453254699707, "learning_rate": 2.7442123094297007e-06, "loss": 0.6058, "step": 22920 }, { "epoch": 4.315829098437794, "grad_norm": 8.151017189025879, "learning_rate": 2.736683606248824e-06, "loss": 0.6961, "step": 22930 }, { "epoch": 4.317711274233013, "grad_norm": 10.600540161132812, "learning_rate": 2.7291549030679465e-06, "loss": 0.7623, "step": 22940 }, { "epoch": 4.319593450028233, "grad_norm": 13.357274055480957, "learning_rate": 2.72162619988707e-06, "loss": 0.4591, "step": 22950 }, { "epoch": 4.321475625823452, "grad_norm": 11.977415084838867, "learning_rate": 2.7140974967061928e-06, "loss": 0.8256, "step": 22960 }, { "epoch": 4.323357801618672, "grad_norm": 5.431375026702881, "learning_rate": 2.7065687935253153e-06, "loss": 0.5577, "step": 22970 }, { "epoch": 4.32523997741389, "grad_norm": 0.1479095071554184, "learning_rate": 2.6990400903444386e-06, "loss": 0.6352, "step": 22980 }, { "epoch": 4.3271221532091095, "grad_norm": 29.114826202392578, "learning_rate": 2.691511387163561e-06, "loss": 0.6353, "step": 22990 }, { "epoch": 4.329004329004329, "grad_norm": 45.33976364135742, "learning_rate": 2.6839826839826844e-06, "loss": 0.6538, "step": 23000 }, { "epoch": 4.330886504799548, "grad_norm": 19.11794662475586, "learning_rate": 2.676453980801807e-06, "loss": 0.5803, "step": 23010 }, { "epoch": 4.332768680594768, "grad_norm": 30.116313934326172, "learning_rate": 2.6689252776209303e-06, "loss": 0.2622, "step": 23020 }, { "epoch": 4.334650856389986, "grad_norm": 19.199289321899414, "learning_rate": 2.661396574440053e-06, "loss": 0.506, "step": 23030 }, { "epoch": 4.336533032185206, "grad_norm": 13.003825187683105, "learning_rate": 2.6538678712591757e-06, "loss": 0.5173, "step": 23040 }, { "epoch": 4.338415207980425, "grad_norm": 18.61155891418457, "learning_rate": 2.6463391680782986e-06, "loss": 0.4607, "step": 23050 }, { "epoch": 4.340297383775645, "grad_norm": 25.670454025268555, "learning_rate": 2.6388104648974216e-06, "loss": 0.6261, "step": 23060 }, { "epoch": 4.342179559570864, "grad_norm": 10.272873878479004, "learning_rate": 2.631281761716545e-06, "loss": 0.6437, "step": 23070 }, { "epoch": 4.3440617353660835, "grad_norm": 0.25827351212501526, "learning_rate": 2.6237530585356674e-06, "loss": 0.7672, "step": 23080 }, { "epoch": 4.345943911161302, "grad_norm": 3.8775696754455566, "learning_rate": 2.6162243553547907e-06, "loss": 0.3559, "step": 23090 }, { "epoch": 4.3478260869565215, "grad_norm": 0.6631990075111389, "learning_rate": 2.6086956521739132e-06, "loss": 0.6067, "step": 23100 }, { "epoch": 4.349708262751741, "grad_norm": 7.079066276550293, "learning_rate": 2.601166948993036e-06, "loss": 0.6171, "step": 23110 }, { "epoch": 4.35159043854696, "grad_norm": 21.724056243896484, "learning_rate": 2.593638245812159e-06, "loss": 0.9693, "step": 23120 }, { "epoch": 4.35347261434218, "grad_norm": 3.3402185440063477, "learning_rate": 2.586109542631282e-06, "loss": 0.6827, "step": 23130 }, { "epoch": 4.355354790137399, "grad_norm": 13.261208534240723, "learning_rate": 2.578580839450405e-06, "loss": 0.4732, "step": 23140 }, { "epoch": 4.357236965932618, "grad_norm": 48.314605712890625, "learning_rate": 2.571052136269528e-06, "loss": 0.5729, "step": 23150 }, { "epoch": 4.359119141727837, "grad_norm": 2.508441925048828, "learning_rate": 2.5635234330886503e-06, "loss": 0.5245, "step": 23160 }, { "epoch": 4.361001317523057, "grad_norm": 18.48830223083496, "learning_rate": 2.5559947299077737e-06, "loss": 0.6878, "step": 23170 }, { "epoch": 4.362883493318276, "grad_norm": 0.715785801410675, "learning_rate": 2.548466026726896e-06, "loss": 0.6787, "step": 23180 }, { "epoch": 4.3647656691134955, "grad_norm": 30.262195587158203, "learning_rate": 2.5409373235460195e-06, "loss": 0.6445, "step": 23190 }, { "epoch": 4.366647844908714, "grad_norm": 5.811047554016113, "learning_rate": 2.5334086203651425e-06, "loss": 0.4486, "step": 23200 }, { "epoch": 4.3685300207039335, "grad_norm": 10.137606620788574, "learning_rate": 2.5258799171842654e-06, "loss": 0.8768, "step": 23210 }, { "epoch": 4.370412196499153, "grad_norm": 11.782840728759766, "learning_rate": 2.5183512140033883e-06, "loss": 0.8652, "step": 23220 }, { "epoch": 4.372294372294372, "grad_norm": 2.407595634460449, "learning_rate": 2.510822510822511e-06, "loss": 0.4058, "step": 23230 }, { "epoch": 4.374176548089592, "grad_norm": 14.99952220916748, "learning_rate": 2.503293807641634e-06, "loss": 0.5289, "step": 23240 }, { "epoch": 4.376058723884811, "grad_norm": 15.092422485351562, "learning_rate": 2.4957651044607566e-06, "loss": 0.6724, "step": 23250 }, { "epoch": 4.37794089968003, "grad_norm": 56.52970886230469, "learning_rate": 2.48823640127988e-06, "loss": 0.53, "step": 23260 }, { "epoch": 4.379823075475249, "grad_norm": 16.378936767578125, "learning_rate": 2.480707698099003e-06, "loss": 0.3907, "step": 23270 }, { "epoch": 4.381705251270469, "grad_norm": 13.952378273010254, "learning_rate": 2.473178994918126e-06, "loss": 0.401, "step": 23280 }, { "epoch": 4.383587427065688, "grad_norm": 5.490935802459717, "learning_rate": 2.4656502917372483e-06, "loss": 0.6424, "step": 23290 }, { "epoch": 4.3854696028609075, "grad_norm": 37.093711853027344, "learning_rate": 2.4581215885563712e-06, "loss": 0.5544, "step": 23300 }, { "epoch": 4.387351778656127, "grad_norm": 18.15984535217285, "learning_rate": 2.450592885375494e-06, "loss": 0.6462, "step": 23310 }, { "epoch": 4.3892339544513455, "grad_norm": 13.298816680908203, "learning_rate": 2.443064182194617e-06, "loss": 0.6277, "step": 23320 }, { "epoch": 4.391116130246565, "grad_norm": 11.613605499267578, "learning_rate": 2.43553547901374e-06, "loss": 0.5823, "step": 23330 }, { "epoch": 4.392998306041784, "grad_norm": 1.215378999710083, "learning_rate": 2.428006775832863e-06, "loss": 0.6063, "step": 23340 }, { "epoch": 4.394880481837004, "grad_norm": 6.986719608306885, "learning_rate": 2.420478072651986e-06, "loss": 0.4439, "step": 23350 }, { "epoch": 4.396762657632223, "grad_norm": 11.11677360534668, "learning_rate": 2.4129493694711088e-06, "loss": 0.4192, "step": 23360 }, { "epoch": 4.398644833427442, "grad_norm": 36.418296813964844, "learning_rate": 2.4054206662902317e-06, "loss": 0.5583, "step": 23370 }, { "epoch": 4.400527009222661, "grad_norm": 9.455841064453125, "learning_rate": 2.3978919631093546e-06, "loss": 0.592, "step": 23380 }, { "epoch": 4.402409185017881, "grad_norm": 0.7446449398994446, "learning_rate": 2.3903632599284775e-06, "loss": 0.8016, "step": 23390 }, { "epoch": 4.4042913608131, "grad_norm": 19.491920471191406, "learning_rate": 2.3828345567476005e-06, "loss": 0.4662, "step": 23400 }, { "epoch": 4.4061735366083195, "grad_norm": 22.67009735107422, "learning_rate": 2.3753058535667234e-06, "loss": 0.3276, "step": 23410 }, { "epoch": 4.408055712403539, "grad_norm": 22.89016342163086, "learning_rate": 2.3677771503858463e-06, "loss": 0.5626, "step": 23420 }, { "epoch": 4.409937888198757, "grad_norm": 0.8171205520629883, "learning_rate": 2.3602484472049692e-06, "loss": 0.5954, "step": 23430 }, { "epoch": 4.411820063993977, "grad_norm": 15.351773262023926, "learning_rate": 2.352719744024092e-06, "loss": 0.5399, "step": 23440 }, { "epoch": 4.413702239789196, "grad_norm": 30.926944732666016, "learning_rate": 2.345191040843215e-06, "loss": 0.8775, "step": 23450 }, { "epoch": 4.415584415584416, "grad_norm": 1.1383413076400757, "learning_rate": 2.337662337662338e-06, "loss": 0.6176, "step": 23460 }, { "epoch": 4.417466591379635, "grad_norm": 13.357398986816406, "learning_rate": 2.3301336344814605e-06, "loss": 0.6766, "step": 23470 }, { "epoch": 4.419348767174855, "grad_norm": 32.312835693359375, "learning_rate": 2.3226049313005834e-06, "loss": 0.4389, "step": 23480 }, { "epoch": 4.421230942970073, "grad_norm": 11.716819763183594, "learning_rate": 2.3150762281197063e-06, "loss": 0.4396, "step": 23490 }, { "epoch": 4.423113118765293, "grad_norm": 5.9410576820373535, "learning_rate": 2.3075475249388297e-06, "loss": 0.4957, "step": 23500 }, { "epoch": 4.424995294560512, "grad_norm": 13.58919906616211, "learning_rate": 2.3000188217579526e-06, "loss": 0.7802, "step": 23510 }, { "epoch": 4.426877470355731, "grad_norm": 18.13483428955078, "learning_rate": 2.2924901185770755e-06, "loss": 0.7189, "step": 23520 }, { "epoch": 4.428759646150951, "grad_norm": 5.589889049530029, "learning_rate": 2.2849614153961984e-06, "loss": 0.4137, "step": 23530 }, { "epoch": 4.430641821946169, "grad_norm": 13.841676712036133, "learning_rate": 2.277432712215321e-06, "loss": 0.6273, "step": 23540 }, { "epoch": 4.432523997741389, "grad_norm": 0.4990882873535156, "learning_rate": 2.269904009034444e-06, "loss": 0.4074, "step": 23550 }, { "epoch": 4.434406173536608, "grad_norm": 8.081299781799316, "learning_rate": 2.2623753058535668e-06, "loss": 0.5185, "step": 23560 }, { "epoch": 4.436288349331828, "grad_norm": 3.278000831604004, "learning_rate": 2.2548466026726897e-06, "loss": 0.6386, "step": 23570 }, { "epoch": 4.438170525127047, "grad_norm": 21.62383270263672, "learning_rate": 2.2473178994918126e-06, "loss": 0.5482, "step": 23580 }, { "epoch": 4.440052700922266, "grad_norm": 44.334747314453125, "learning_rate": 2.2397891963109355e-06, "loss": 0.576, "step": 23590 }, { "epoch": 4.441934876717485, "grad_norm": 13.59565258026123, "learning_rate": 2.2322604931300585e-06, "loss": 0.5771, "step": 23600 }, { "epoch": 4.4438170525127045, "grad_norm": 19.645078659057617, "learning_rate": 2.2247317899491814e-06, "loss": 0.7014, "step": 23610 }, { "epoch": 4.445699228307924, "grad_norm": 7.434754371643066, "learning_rate": 2.2172030867683043e-06, "loss": 0.4733, "step": 23620 }, { "epoch": 4.447581404103143, "grad_norm": 49.53765869140625, "learning_rate": 2.2096743835874272e-06, "loss": 0.7781, "step": 23630 }, { "epoch": 4.449463579898363, "grad_norm": 2.3648738861083984, "learning_rate": 2.20214568040655e-06, "loss": 0.6185, "step": 23640 }, { "epoch": 4.451345755693581, "grad_norm": 9.881647109985352, "learning_rate": 2.194616977225673e-06, "loss": 0.7099, "step": 23650 }, { "epoch": 4.453227931488801, "grad_norm": 15.192410469055176, "learning_rate": 2.187088274044796e-06, "loss": 0.5284, "step": 23660 }, { "epoch": 4.45511010728402, "grad_norm": 0.09341107308864594, "learning_rate": 2.179559570863919e-06, "loss": 0.2563, "step": 23670 }, { "epoch": 4.45699228307924, "grad_norm": 17.3346004486084, "learning_rate": 2.172030867683042e-06, "loss": 0.5827, "step": 23680 }, { "epoch": 4.458874458874459, "grad_norm": 12.81985855102539, "learning_rate": 2.1645021645021648e-06, "loss": 0.476, "step": 23690 }, { "epoch": 4.4607566346696785, "grad_norm": 11.144566535949707, "learning_rate": 2.1569734613212877e-06, "loss": 0.4212, "step": 23700 }, { "epoch": 4.462638810464897, "grad_norm": 30.27758026123047, "learning_rate": 2.1494447581404106e-06, "loss": 0.5779, "step": 23710 }, { "epoch": 4.4645209862601165, "grad_norm": 29.81888771057129, "learning_rate": 2.1419160549595335e-06, "loss": 0.7226, "step": 23720 }, { "epoch": 4.466403162055336, "grad_norm": 0.8965347409248352, "learning_rate": 2.134387351778656e-06, "loss": 0.7259, "step": 23730 }, { "epoch": 4.468285337850555, "grad_norm": 12.240394592285156, "learning_rate": 2.126858648597779e-06, "loss": 0.3844, "step": 23740 }, { "epoch": 4.470167513645775, "grad_norm": 1.0969513654708862, "learning_rate": 2.1193299454169023e-06, "loss": 0.4925, "step": 23750 }, { "epoch": 4.472049689440993, "grad_norm": 6.099503040313721, "learning_rate": 2.111801242236025e-06, "loss": 0.4466, "step": 23760 }, { "epoch": 4.473931865236213, "grad_norm": 15.072311401367188, "learning_rate": 2.104272539055148e-06, "loss": 0.7378, "step": 23770 }, { "epoch": 4.475814041031432, "grad_norm": 38.78789520263672, "learning_rate": 2.096743835874271e-06, "loss": 0.7411, "step": 23780 }, { "epoch": 4.477696216826652, "grad_norm": 3.0044000148773193, "learning_rate": 2.089215132693394e-06, "loss": 0.3366, "step": 23790 }, { "epoch": 4.479578392621871, "grad_norm": 17.955310821533203, "learning_rate": 2.0816864295125165e-06, "loss": 0.5898, "step": 23800 }, { "epoch": 4.4814605684170905, "grad_norm": 7.283894062042236, "learning_rate": 2.0741577263316394e-06, "loss": 0.6196, "step": 23810 }, { "epoch": 4.483342744212309, "grad_norm": 20.742965698242188, "learning_rate": 2.0666290231507623e-06, "loss": 0.563, "step": 23820 }, { "epoch": 4.4852249200075285, "grad_norm": 0.6865695714950562, "learning_rate": 2.0591003199698852e-06, "loss": 0.3824, "step": 23830 }, { "epoch": 4.487107095802748, "grad_norm": 18.67192268371582, "learning_rate": 2.051571616789008e-06, "loss": 0.63, "step": 23840 }, { "epoch": 4.488989271597967, "grad_norm": 5.289563179016113, "learning_rate": 2.044042913608131e-06, "loss": 0.6516, "step": 23850 }, { "epoch": 4.490871447393187, "grad_norm": 15.691003799438477, "learning_rate": 2.036514210427254e-06, "loss": 0.4024, "step": 23860 }, { "epoch": 4.492753623188406, "grad_norm": 22.759946823120117, "learning_rate": 2.028985507246377e-06, "loss": 0.9504, "step": 23870 }, { "epoch": 4.494635798983625, "grad_norm": 1.0771783590316772, "learning_rate": 2.0214568040655e-06, "loss": 0.4739, "step": 23880 }, { "epoch": 4.496517974778844, "grad_norm": 4.770224094390869, "learning_rate": 2.0139281008846228e-06, "loss": 0.8352, "step": 23890 }, { "epoch": 4.498400150574064, "grad_norm": 13.002606391906738, "learning_rate": 2.0063993977037457e-06, "loss": 0.4923, "step": 23900 }, { "epoch": 4.500282326369283, "grad_norm": 5.516331672668457, "learning_rate": 1.9988706945228686e-06, "loss": 0.4354, "step": 23910 }, { "epoch": 4.5021645021645025, "grad_norm": 16.686582565307617, "learning_rate": 1.9913419913419915e-06, "loss": 0.638, "step": 23920 }, { "epoch": 4.504046677959721, "grad_norm": 25.16171646118164, "learning_rate": 1.9838132881611144e-06, "loss": 0.649, "step": 23930 }, { "epoch": 4.5059288537549405, "grad_norm": 22.84310531616211, "learning_rate": 1.9762845849802374e-06, "loss": 0.7048, "step": 23940 }, { "epoch": 4.50781102955016, "grad_norm": 11.860065460205078, "learning_rate": 1.9687558817993603e-06, "loss": 0.8613, "step": 23950 }, { "epoch": 4.509693205345379, "grad_norm": 8.635574340820312, "learning_rate": 1.961227178618483e-06, "loss": 0.5576, "step": 23960 }, { "epoch": 4.511575381140599, "grad_norm": 11.803613662719727, "learning_rate": 1.953698475437606e-06, "loss": 0.3823, "step": 23970 }, { "epoch": 4.513457556935818, "grad_norm": 4.785434246063232, "learning_rate": 1.9461697722567286e-06, "loss": 0.9674, "step": 23980 }, { "epoch": 4.515339732731037, "grad_norm": 135.4927215576172, "learning_rate": 1.938641069075852e-06, "loss": 0.7317, "step": 23990 }, { "epoch": 4.517221908526256, "grad_norm": 11.872318267822266, "learning_rate": 1.931112365894975e-06, "loss": 0.485, "step": 24000 }, { "epoch": 4.519104084321476, "grad_norm": 0.5789291858673096, "learning_rate": 1.923583662714098e-06, "loss": 0.5606, "step": 24010 }, { "epoch": 4.520986260116695, "grad_norm": 7.3305439949035645, "learning_rate": 1.9160549595332207e-06, "loss": 0.6314, "step": 24020 }, { "epoch": 4.5228684359119145, "grad_norm": 13.919968605041504, "learning_rate": 1.9085262563523437e-06, "loss": 0.6887, "step": 24030 }, { "epoch": 4.524750611707134, "grad_norm": 1.8676209449768066, "learning_rate": 1.9009975531714664e-06, "loss": 0.5487, "step": 24040 }, { "epoch": 4.526632787502352, "grad_norm": 22.899232864379883, "learning_rate": 1.8934688499905893e-06, "loss": 0.7641, "step": 24050 }, { "epoch": 4.528514963297572, "grad_norm": 5.90631103515625, "learning_rate": 1.8859401468097122e-06, "loss": 0.7595, "step": 24060 }, { "epoch": 4.530397139092791, "grad_norm": 9.364104270935059, "learning_rate": 1.8784114436288351e-06, "loss": 0.2766, "step": 24070 }, { "epoch": 4.532279314888011, "grad_norm": 57.38972854614258, "learning_rate": 1.8708827404479578e-06, "loss": 0.6298, "step": 24080 }, { "epoch": 4.53416149068323, "grad_norm": 0.9203323125839233, "learning_rate": 1.8633540372670808e-06, "loss": 0.6828, "step": 24090 }, { "epoch": 4.536043666478449, "grad_norm": 16.27019500732422, "learning_rate": 1.8558253340862037e-06, "loss": 0.8687, "step": 24100 }, { "epoch": 4.537925842273668, "grad_norm": 10.179085731506348, "learning_rate": 1.8482966309053266e-06, "loss": 0.5654, "step": 24110 }, { "epoch": 4.539808018068888, "grad_norm": 31.10147476196289, "learning_rate": 1.8407679277244497e-06, "loss": 0.6024, "step": 24120 }, { "epoch": 4.541690193864107, "grad_norm": 5.746549606323242, "learning_rate": 1.8332392245435727e-06, "loss": 0.6073, "step": 24130 }, { "epoch": 4.543572369659326, "grad_norm": 18.98149299621582, "learning_rate": 1.8257105213626956e-06, "loss": 0.6642, "step": 24140 }, { "epoch": 4.545454545454545, "grad_norm": 11.173285484313965, "learning_rate": 1.8181818181818183e-06, "loss": 0.4425, "step": 24150 }, { "epoch": 4.547336721249764, "grad_norm": 68.92263793945312, "learning_rate": 1.8106531150009412e-06, "loss": 0.4738, "step": 24160 }, { "epoch": 4.549218897044984, "grad_norm": 17.539867401123047, "learning_rate": 1.8031244118200641e-06, "loss": 0.6192, "step": 24170 }, { "epoch": 4.551101072840203, "grad_norm": 22.743282318115234, "learning_rate": 1.795595708639187e-06, "loss": 0.7029, "step": 24180 }, { "epoch": 4.552983248635423, "grad_norm": 14.960015296936035, "learning_rate": 1.78806700545831e-06, "loss": 0.3663, "step": 24190 }, { "epoch": 4.554865424430642, "grad_norm": 16.499238967895508, "learning_rate": 1.780538302277433e-06, "loss": 0.5681, "step": 24200 }, { "epoch": 4.556747600225862, "grad_norm": 7.2052388191223145, "learning_rate": 1.7730095990965556e-06, "loss": 0.3522, "step": 24210 }, { "epoch": 4.55862977602108, "grad_norm": 10.154321670532227, "learning_rate": 1.7654808959156785e-06, "loss": 0.6202, "step": 24220 }, { "epoch": 4.5605119518162995, "grad_norm": 3.8168325424194336, "learning_rate": 1.7579521927348015e-06, "loss": 0.5148, "step": 24230 }, { "epoch": 4.562394127611519, "grad_norm": 5.976160526275635, "learning_rate": 1.7504234895539246e-06, "loss": 0.6931, "step": 24240 }, { "epoch": 4.564276303406738, "grad_norm": 10.809183120727539, "learning_rate": 1.7428947863730475e-06, "loss": 0.6397, "step": 24250 }, { "epoch": 4.566158479201958, "grad_norm": 6.282565116882324, "learning_rate": 1.7353660831921704e-06, "loss": 0.4463, "step": 24260 }, { "epoch": 4.568040654997176, "grad_norm": 1.7869082689285278, "learning_rate": 1.7278373800112933e-06, "loss": 0.3967, "step": 24270 }, { "epoch": 4.569922830792396, "grad_norm": 7.395685195922852, "learning_rate": 1.720308676830416e-06, "loss": 0.5906, "step": 24280 }, { "epoch": 4.571805006587615, "grad_norm": 14.580101013183594, "learning_rate": 1.712779973649539e-06, "loss": 0.5194, "step": 24290 }, { "epoch": 4.573687182382835, "grad_norm": 29.10459327697754, "learning_rate": 1.705251270468662e-06, "loss": 0.5974, "step": 24300 }, { "epoch": 4.575569358178054, "grad_norm": 15.645312309265137, "learning_rate": 1.6977225672877848e-06, "loss": 0.5314, "step": 24310 }, { "epoch": 4.577451533973273, "grad_norm": 20.30552101135254, "learning_rate": 1.6901938641069077e-06, "loss": 0.557, "step": 24320 }, { "epoch": 4.579333709768492, "grad_norm": 0.3066312074661255, "learning_rate": 1.6826651609260305e-06, "loss": 0.4308, "step": 24330 }, { "epoch": 4.5812158855637115, "grad_norm": 17.451622009277344, "learning_rate": 1.6751364577451534e-06, "loss": 0.2231, "step": 24340 }, { "epoch": 4.583098061358931, "grad_norm": 20.441179275512695, "learning_rate": 1.6676077545642763e-06, "loss": 0.4294, "step": 24350 }, { "epoch": 4.58498023715415, "grad_norm": 25.281518936157227, "learning_rate": 1.6600790513833994e-06, "loss": 0.4772, "step": 24360 }, { "epoch": 4.58686241294937, "grad_norm": 21.41700553894043, "learning_rate": 1.6525503482025224e-06, "loss": 0.7507, "step": 24370 }, { "epoch": 4.588744588744589, "grad_norm": 10.977895736694336, "learning_rate": 1.6450216450216453e-06, "loss": 0.4713, "step": 24380 }, { "epoch": 4.590626764539808, "grad_norm": 2.4768362045288086, "learning_rate": 1.6374929418407682e-06, "loss": 0.5988, "step": 24390 }, { "epoch": 4.592508940335027, "grad_norm": 11.520479202270508, "learning_rate": 1.629964238659891e-06, "loss": 0.6857, "step": 24400 }, { "epoch": 4.594391116130247, "grad_norm": 14.980639457702637, "learning_rate": 1.6224355354790138e-06, "loss": 0.7079, "step": 24410 }, { "epoch": 4.596273291925466, "grad_norm": 15.762008666992188, "learning_rate": 1.6149068322981367e-06, "loss": 0.4411, "step": 24420 }, { "epoch": 4.5981554677206855, "grad_norm": 19.70981216430664, "learning_rate": 1.6073781291172597e-06, "loss": 0.8163, "step": 24430 }, { "epoch": 4.600037643515904, "grad_norm": 21.06952667236328, "learning_rate": 1.5998494259363826e-06, "loss": 0.4246, "step": 24440 }, { "epoch": 4.6019198193111235, "grad_norm": 12.949865341186523, "learning_rate": 1.5923207227555055e-06, "loss": 0.4607, "step": 24450 }, { "epoch": 4.603801995106343, "grad_norm": 5.208014965057373, "learning_rate": 1.5847920195746282e-06, "loss": 0.6787, "step": 24460 }, { "epoch": 4.605684170901562, "grad_norm": 15.924193382263184, "learning_rate": 1.5772633163937511e-06, "loss": 0.5571, "step": 24470 }, { "epoch": 4.607566346696782, "grad_norm": 5.009922981262207, "learning_rate": 1.5697346132128743e-06, "loss": 0.7348, "step": 24480 }, { "epoch": 4.609448522492, "grad_norm": 6.933772563934326, "learning_rate": 1.5622059100319972e-06, "loss": 0.4897, "step": 24490 }, { "epoch": 4.61133069828722, "grad_norm": 12.554123878479004, "learning_rate": 1.5546772068511201e-06, "loss": 0.5164, "step": 24500 }, { "epoch": 4.613212874082439, "grad_norm": 30.179964065551758, "learning_rate": 1.547148503670243e-06, "loss": 0.4959, "step": 24510 }, { "epoch": 4.615095049877659, "grad_norm": 16.29331398010254, "learning_rate": 1.539619800489366e-06, "loss": 0.9734, "step": 24520 }, { "epoch": 4.616977225672878, "grad_norm": 11.80064868927002, "learning_rate": 1.5320910973084887e-06, "loss": 0.4734, "step": 24530 }, { "epoch": 4.6188594014680975, "grad_norm": 22.42782974243164, "learning_rate": 1.5245623941276116e-06, "loss": 0.4877, "step": 24540 }, { "epoch": 4.620741577263316, "grad_norm": 0.34393635392189026, "learning_rate": 1.5170336909467345e-06, "loss": 0.4987, "step": 24550 }, { "epoch": 4.6226237530585355, "grad_norm": 0.2390964925289154, "learning_rate": 1.5095049877658574e-06, "loss": 0.4389, "step": 24560 }, { "epoch": 4.624505928853755, "grad_norm": 9.884252548217773, "learning_rate": 1.5019762845849804e-06, "loss": 0.698, "step": 24570 }, { "epoch": 4.626388104648974, "grad_norm": 9.831954002380371, "learning_rate": 1.4944475814041033e-06, "loss": 0.5781, "step": 24580 }, { "epoch": 4.628270280444194, "grad_norm": 30.68934440612793, "learning_rate": 1.486918878223226e-06, "loss": 0.5255, "step": 24590 }, { "epoch": 4.630152456239413, "grad_norm": 10.3611478805542, "learning_rate": 1.479390175042349e-06, "loss": 0.807, "step": 24600 }, { "epoch": 4.632034632034632, "grad_norm": 9.771163940429688, "learning_rate": 1.471861471861472e-06, "loss": 0.4991, "step": 24610 }, { "epoch": 4.633916807829851, "grad_norm": 11.571956634521484, "learning_rate": 1.464332768680595e-06, "loss": 0.4391, "step": 24620 }, { "epoch": 4.635798983625071, "grad_norm": 1.4063133001327515, "learning_rate": 1.4568040654997179e-06, "loss": 0.6051, "step": 24630 }, { "epoch": 4.63768115942029, "grad_norm": 4.350733757019043, "learning_rate": 1.4492753623188408e-06, "loss": 0.5334, "step": 24640 }, { "epoch": 4.6395633352155095, "grad_norm": 21.370582580566406, "learning_rate": 1.4417466591379637e-06, "loss": 0.4807, "step": 24650 }, { "epoch": 4.641445511010728, "grad_norm": 57.6276969909668, "learning_rate": 1.4342179559570864e-06, "loss": 0.752, "step": 24660 }, { "epoch": 4.643327686805947, "grad_norm": 5.320174217224121, "learning_rate": 1.4266892527762094e-06, "loss": 0.6826, "step": 24670 }, { "epoch": 4.645209862601167, "grad_norm": 37.81419372558594, "learning_rate": 1.4191605495953323e-06, "loss": 0.8429, "step": 24680 }, { "epoch": 4.647092038396386, "grad_norm": 28.828960418701172, "learning_rate": 1.4116318464144552e-06, "loss": 0.7019, "step": 24690 }, { "epoch": 4.648974214191606, "grad_norm": 8.05349349975586, "learning_rate": 1.4041031432335781e-06, "loss": 0.6875, "step": 24700 }, { "epoch": 4.650856389986825, "grad_norm": 4.589570999145508, "learning_rate": 1.3965744400527008e-06, "loss": 0.5141, "step": 24710 }, { "epoch": 4.652738565782044, "grad_norm": 21.772384643554688, "learning_rate": 1.3890457368718238e-06, "loss": 0.6424, "step": 24720 }, { "epoch": 4.654620741577263, "grad_norm": 27.69976806640625, "learning_rate": 1.3815170336909469e-06, "loss": 0.4256, "step": 24730 }, { "epoch": 4.656502917372483, "grad_norm": 32.07310104370117, "learning_rate": 1.3739883305100698e-06, "loss": 0.4189, "step": 24740 }, { "epoch": 4.658385093167702, "grad_norm": 16.623987197875977, "learning_rate": 1.3664596273291927e-06, "loss": 0.5246, "step": 24750 }, { "epoch": 4.660267268962921, "grad_norm": 15.414529800415039, "learning_rate": 1.3589309241483157e-06, "loss": 0.7166, "step": 24760 }, { "epoch": 4.662149444758141, "grad_norm": 14.814608573913574, "learning_rate": 1.3514022209674386e-06, "loss": 0.8627, "step": 24770 }, { "epoch": 4.664031620553359, "grad_norm": 18.566692352294922, "learning_rate": 1.3438735177865615e-06, "loss": 0.7965, "step": 24780 }, { "epoch": 4.665913796348579, "grad_norm": 7.707961559295654, "learning_rate": 1.3363448146056842e-06, "loss": 0.5256, "step": 24790 }, { "epoch": 4.667795972143798, "grad_norm": 0.7956398725509644, "learning_rate": 1.3288161114248071e-06, "loss": 0.9007, "step": 24800 }, { "epoch": 4.669678147939018, "grad_norm": 25.115041732788086, "learning_rate": 1.32128740824393e-06, "loss": 0.6778, "step": 24810 }, { "epoch": 4.671560323734237, "grad_norm": 1.7890321016311646, "learning_rate": 1.313758705063053e-06, "loss": 0.9155, "step": 24820 }, { "epoch": 4.673442499529456, "grad_norm": 16.954694747924805, "learning_rate": 1.3062300018821759e-06, "loss": 0.6522, "step": 24830 }, { "epoch": 4.675324675324675, "grad_norm": 18.548112869262695, "learning_rate": 1.2987012987012986e-06, "loss": 0.3274, "step": 24840 }, { "epoch": 4.6772068511198945, "grad_norm": 19.337177276611328, "learning_rate": 1.291172595520422e-06, "loss": 0.5718, "step": 24850 }, { "epoch": 4.679089026915114, "grad_norm": 29.29849624633789, "learning_rate": 1.2836438923395447e-06, "loss": 0.6632, "step": 24860 }, { "epoch": 4.680971202710333, "grad_norm": 0.6499333381652832, "learning_rate": 1.2761151891586676e-06, "loss": 0.3925, "step": 24870 }, { "epoch": 4.682853378505552, "grad_norm": 10.113441467285156, "learning_rate": 1.2685864859777905e-06, "loss": 0.4654, "step": 24880 }, { "epoch": 4.684735554300771, "grad_norm": 0.6062495708465576, "learning_rate": 1.2610577827969134e-06, "loss": 0.6459, "step": 24890 }, { "epoch": 4.686617730095991, "grad_norm": 6.477812767028809, "learning_rate": 1.2535290796160363e-06, "loss": 0.4256, "step": 24900 }, { "epoch": 4.68849990589121, "grad_norm": 11.277091979980469, "learning_rate": 1.246000376435159e-06, "loss": 0.5037, "step": 24910 }, { "epoch": 4.69038208168643, "grad_norm": 24.859067916870117, "learning_rate": 1.238471673254282e-06, "loss": 0.6871, "step": 24920 }, { "epoch": 4.692264257481649, "grad_norm": 16.682817459106445, "learning_rate": 1.2309429700734049e-06, "loss": 0.4382, "step": 24930 }, { "epoch": 4.6941464332768685, "grad_norm": 15.542104721069336, "learning_rate": 1.223414266892528e-06, "loss": 0.4241, "step": 24940 }, { "epoch": 4.696028609072087, "grad_norm": 0.6870865225791931, "learning_rate": 1.2158855637116507e-06, "loss": 0.5352, "step": 24950 }, { "epoch": 4.6979107848673065, "grad_norm": 7.823055744171143, "learning_rate": 1.2083568605307737e-06, "loss": 0.9063, "step": 24960 }, { "epoch": 4.699792960662526, "grad_norm": 4.5665483474731445, "learning_rate": 1.2008281573498966e-06, "loss": 0.7457, "step": 24970 }, { "epoch": 4.701675136457745, "grad_norm": 1.2452117204666138, "learning_rate": 1.1932994541690195e-06, "loss": 0.6592, "step": 24980 }, { "epoch": 4.703557312252965, "grad_norm": 18.012990951538086, "learning_rate": 1.1857707509881424e-06, "loss": 0.401, "step": 24990 }, { "epoch": 4.705439488048183, "grad_norm": 7.86597204208374, "learning_rate": 1.1782420478072651e-06, "loss": 0.7094, "step": 25000 }, { "epoch": 4.707321663843403, "grad_norm": 32.99113845825195, "learning_rate": 1.1707133446263883e-06, "loss": 0.4918, "step": 25010 }, { "epoch": 4.709203839638622, "grad_norm": 33.427459716796875, "learning_rate": 1.1631846414455112e-06, "loss": 0.5377, "step": 25020 }, { "epoch": 4.711086015433842, "grad_norm": 0.6558060050010681, "learning_rate": 1.155655938264634e-06, "loss": 0.6283, "step": 25030 }, { "epoch": 4.712968191229061, "grad_norm": 0.2545570433139801, "learning_rate": 1.1481272350837568e-06, "loss": 0.4473, "step": 25040 }, { "epoch": 4.71485036702428, "grad_norm": 26.013124465942383, "learning_rate": 1.1405985319028797e-06, "loss": 0.6082, "step": 25050 }, { "epoch": 4.716732542819499, "grad_norm": 6.071534156799316, "learning_rate": 1.1330698287220027e-06, "loss": 0.5555, "step": 25060 }, { "epoch": 4.7186147186147185, "grad_norm": 24.877159118652344, "learning_rate": 1.1255411255411256e-06, "loss": 1.0349, "step": 25070 }, { "epoch": 4.720496894409938, "grad_norm": 7.250149250030518, "learning_rate": 1.1180124223602485e-06, "loss": 0.5758, "step": 25080 }, { "epoch": 4.722379070205157, "grad_norm": 2.08498215675354, "learning_rate": 1.1104837191793714e-06, "loss": 0.8396, "step": 25090 }, { "epoch": 4.724261246000377, "grad_norm": 12.615907669067383, "learning_rate": 1.1029550159984943e-06, "loss": 0.4626, "step": 25100 }, { "epoch": 4.726143421795595, "grad_norm": 24.194643020629883, "learning_rate": 1.0954263128176173e-06, "loss": 0.4744, "step": 25110 }, { "epoch": 4.728025597590815, "grad_norm": 5.291766166687012, "learning_rate": 1.0878976096367402e-06, "loss": 0.3882, "step": 25120 }, { "epoch": 4.729907773386034, "grad_norm": 20.6169376373291, "learning_rate": 1.0803689064558631e-06, "loss": 0.6649, "step": 25130 }, { "epoch": 4.731789949181254, "grad_norm": 18.83254623413086, "learning_rate": 1.072840203274986e-06, "loss": 0.4599, "step": 25140 }, { "epoch": 4.733672124976473, "grad_norm": 12.41308307647705, "learning_rate": 1.065311500094109e-06, "loss": 0.8391, "step": 25150 }, { "epoch": 4.7355543007716925, "grad_norm": 13.976269721984863, "learning_rate": 1.0577827969132319e-06, "loss": 0.5846, "step": 25160 }, { "epoch": 4.737436476566911, "grad_norm": 25.47661590576172, "learning_rate": 1.0502540937323546e-06, "loss": 0.604, "step": 25170 }, { "epoch": 4.7393186523621305, "grad_norm": 21.08745765686035, "learning_rate": 1.0427253905514775e-06, "loss": 0.4192, "step": 25180 }, { "epoch": 4.74120082815735, "grad_norm": 0.9242314696311951, "learning_rate": 1.0351966873706006e-06, "loss": 0.3425, "step": 25190 }, { "epoch": 4.743083003952569, "grad_norm": 11.735674858093262, "learning_rate": 1.0276679841897233e-06, "loss": 0.5547, "step": 25200 }, { "epoch": 4.744965179747789, "grad_norm": 7.690491199493408, "learning_rate": 1.0201392810088463e-06, "loss": 0.3842, "step": 25210 }, { "epoch": 4.746847355543007, "grad_norm": 12.007920265197754, "learning_rate": 1.0126105778279692e-06, "loss": 0.679, "step": 25220 }, { "epoch": 4.748729531338227, "grad_norm": 20.26568603515625, "learning_rate": 1.0050818746470921e-06, "loss": 0.751, "step": 25230 }, { "epoch": 4.750611707133446, "grad_norm": 0.7630726099014282, "learning_rate": 9.97553171466215e-07, "loss": 0.3219, "step": 25240 }, { "epoch": 4.752493882928666, "grad_norm": 5.351938247680664, "learning_rate": 9.90024468285338e-07, "loss": 0.6458, "step": 25250 }, { "epoch": 4.754376058723885, "grad_norm": 11.440779685974121, "learning_rate": 9.824957651044609e-07, "loss": 0.3074, "step": 25260 }, { "epoch": 4.7562582345191045, "grad_norm": 13.200923919677734, "learning_rate": 9.749670619235838e-07, "loss": 0.7268, "step": 25270 }, { "epoch": 4.758140410314323, "grad_norm": 0.8628940582275391, "learning_rate": 9.674383587427067e-07, "loss": 0.5937, "step": 25280 }, { "epoch": 4.760022586109542, "grad_norm": 12.860737800598145, "learning_rate": 9.599096555618294e-07, "loss": 0.6364, "step": 25290 }, { "epoch": 4.761904761904762, "grad_norm": 18.585010528564453, "learning_rate": 9.523809523809525e-07, "loss": 0.604, "step": 25300 }, { "epoch": 4.763786937699981, "grad_norm": 10.778792381286621, "learning_rate": 9.448522492000754e-07, "loss": 0.408, "step": 25310 }, { "epoch": 4.765669113495201, "grad_norm": 4.942008018493652, "learning_rate": 9.373235460191983e-07, "loss": 0.6934, "step": 25320 }, { "epoch": 4.76755128929042, "grad_norm": 9.399242401123047, "learning_rate": 9.297948428383212e-07, "loss": 0.4827, "step": 25330 }, { "epoch": 4.769433465085639, "grad_norm": 7.679773330688477, "learning_rate": 9.22266139657444e-07, "loss": 0.448, "step": 25340 }, { "epoch": 4.771315640880858, "grad_norm": 11.352853775024414, "learning_rate": 9.14737436476567e-07, "loss": 0.5005, "step": 25350 }, { "epoch": 4.773197816676078, "grad_norm": 15.323731422424316, "learning_rate": 9.072087332956899e-07, "loss": 0.8852, "step": 25360 }, { "epoch": 4.775079992471297, "grad_norm": 34.29592514038086, "learning_rate": 8.996800301148129e-07, "loss": 0.4909, "step": 25370 }, { "epoch": 4.776962168266516, "grad_norm": 15.671870231628418, "learning_rate": 8.921513269339357e-07, "loss": 0.5851, "step": 25380 }, { "epoch": 4.778844344061735, "grad_norm": 15.997258186340332, "learning_rate": 8.846226237530586e-07, "loss": 0.5856, "step": 25390 }, { "epoch": 4.780726519856954, "grad_norm": 7.108194828033447, "learning_rate": 8.770939205721816e-07, "loss": 0.6275, "step": 25400 }, { "epoch": 4.782608695652174, "grad_norm": 17.355302810668945, "learning_rate": 8.695652173913044e-07, "loss": 0.6376, "step": 25410 }, { "epoch": 4.784490871447393, "grad_norm": 2.995657205581665, "learning_rate": 8.620365142104273e-07, "loss": 0.8359, "step": 25420 }, { "epoch": 4.786373047242613, "grad_norm": 11.23421859741211, "learning_rate": 8.545078110295501e-07, "loss": 0.8888, "step": 25430 }, { "epoch": 4.788255223037831, "grad_norm": 11.181546211242676, "learning_rate": 8.469791078486731e-07, "loss": 0.8431, "step": 25440 }, { "epoch": 4.790137398833051, "grad_norm": 17.23093032836914, "learning_rate": 8.394504046677961e-07, "loss": 0.4511, "step": 25450 }, { "epoch": 4.79201957462827, "grad_norm": 17.615921020507812, "learning_rate": 8.31921701486919e-07, "loss": 0.7012, "step": 25460 }, { "epoch": 4.7939017504234895, "grad_norm": 3.298682451248169, "learning_rate": 8.243929983060418e-07, "loss": 0.5764, "step": 25470 }, { "epoch": 4.795783926218709, "grad_norm": 0.7209872603416443, "learning_rate": 8.168642951251647e-07, "loss": 0.5459, "step": 25480 }, { "epoch": 4.797666102013928, "grad_norm": 10.0849609375, "learning_rate": 8.093355919442876e-07, "loss": 0.4159, "step": 25490 }, { "epoch": 4.799548277809148, "grad_norm": 2.3432960510253906, "learning_rate": 8.018068887634107e-07, "loss": 0.5565, "step": 25500 }, { "epoch": 4.801430453604366, "grad_norm": 1.5209139585494995, "learning_rate": 7.942781855825335e-07, "loss": 0.3799, "step": 25510 }, { "epoch": 4.803312629399586, "grad_norm": 8.557663917541504, "learning_rate": 7.867494824016564e-07, "loss": 0.6082, "step": 25520 }, { "epoch": 4.805194805194805, "grad_norm": 42.79642105102539, "learning_rate": 7.792207792207792e-07, "loss": 0.5106, "step": 25530 }, { "epoch": 4.807076980990025, "grad_norm": 20.128494262695312, "learning_rate": 7.716920760399021e-07, "loss": 0.3495, "step": 25540 }, { "epoch": 4.808959156785244, "grad_norm": 8.818672180175781, "learning_rate": 7.641633728590251e-07, "loss": 0.4237, "step": 25550 }, { "epoch": 4.810841332580463, "grad_norm": 10.33991813659668, "learning_rate": 7.566346696781481e-07, "loss": 0.6665, "step": 25560 }, { "epoch": 4.812723508375682, "grad_norm": 15.527596473693848, "learning_rate": 7.491059664972709e-07, "loss": 0.5248, "step": 25570 }, { "epoch": 4.8146056841709015, "grad_norm": 1.0982279777526855, "learning_rate": 7.415772633163938e-07, "loss": 0.4951, "step": 25580 }, { "epoch": 4.816487859966121, "grad_norm": 10.367071151733398, "learning_rate": 7.340485601355168e-07, "loss": 0.5674, "step": 25590 }, { "epoch": 4.81837003576134, "grad_norm": 5.20026969909668, "learning_rate": 7.265198569546396e-07, "loss": 0.528, "step": 25600 }, { "epoch": 4.820252211556559, "grad_norm": 15.308959007263184, "learning_rate": 7.189911537737625e-07, "loss": 0.716, "step": 25610 }, { "epoch": 4.822134387351778, "grad_norm": 35.8542366027832, "learning_rate": 7.114624505928855e-07, "loss": 0.6604, "step": 25620 }, { "epoch": 4.824016563146998, "grad_norm": 45.377445220947266, "learning_rate": 7.039337474120083e-07, "loss": 0.5864, "step": 25630 }, { "epoch": 4.825898738942217, "grad_norm": 37.29194259643555, "learning_rate": 6.964050442311313e-07, "loss": 0.4791, "step": 25640 }, { "epoch": 4.827780914737437, "grad_norm": 0.8685435652732849, "learning_rate": 6.888763410502542e-07, "loss": 0.3548, "step": 25650 }, { "epoch": 4.829663090532656, "grad_norm": 10.831360816955566, "learning_rate": 6.81347637869377e-07, "loss": 0.4429, "step": 25660 }, { "epoch": 4.8315452663278755, "grad_norm": 15.885283470153809, "learning_rate": 6.738189346884999e-07, "loss": 0.4418, "step": 25670 }, { "epoch": 4.833427442123094, "grad_norm": 12.996732711791992, "learning_rate": 6.662902315076229e-07, "loss": 0.3234, "step": 25680 }, { "epoch": 4.8353096179183135, "grad_norm": 17.03729248046875, "learning_rate": 6.587615283267459e-07, "loss": 0.4515, "step": 25690 }, { "epoch": 4.837191793713533, "grad_norm": 4.803049087524414, "learning_rate": 6.512328251458687e-07, "loss": 0.5221, "step": 25700 }, { "epoch": 4.839073969508752, "grad_norm": 8.547290802001953, "learning_rate": 6.437041219649916e-07, "loss": 0.3565, "step": 25710 }, { "epoch": 4.840956145303972, "grad_norm": 7.503337860107422, "learning_rate": 6.361754187841144e-07, "loss": 0.6951, "step": 25720 }, { "epoch": 4.84283832109919, "grad_norm": 541.9999389648438, "learning_rate": 6.286467156032373e-07, "loss": 0.2751, "step": 25730 }, { "epoch": 4.84472049689441, "grad_norm": 12.69980239868164, "learning_rate": 6.211180124223603e-07, "loss": 0.4805, "step": 25740 }, { "epoch": 4.846602672689629, "grad_norm": 8.881111145019531, "learning_rate": 6.135893092414832e-07, "loss": 0.502, "step": 25750 }, { "epoch": 4.848484848484849, "grad_norm": 8.829740524291992, "learning_rate": 6.060606060606061e-07, "loss": 0.5651, "step": 25760 }, { "epoch": 4.850367024280068, "grad_norm": 21.692197799682617, "learning_rate": 5.98531902879729e-07, "loss": 0.6097, "step": 25770 }, { "epoch": 4.852249200075287, "grad_norm": 12.04153060913086, "learning_rate": 5.910031996988519e-07, "loss": 0.4574, "step": 25780 }, { "epoch": 4.854131375870506, "grad_norm": 10.96838092803955, "learning_rate": 5.834744965179749e-07, "loss": 0.5796, "step": 25790 }, { "epoch": 4.8560135516657255, "grad_norm": 17.027944564819336, "learning_rate": 5.759457933370977e-07, "loss": 0.6143, "step": 25800 }, { "epoch": 4.857895727460945, "grad_norm": 2.2098312377929688, "learning_rate": 5.684170901562206e-07, "loss": 0.5982, "step": 25810 }, { "epoch": 4.859777903256164, "grad_norm": 10.510016441345215, "learning_rate": 5.608883869753435e-07, "loss": 0.5048, "step": 25820 }, { "epoch": 4.861660079051384, "grad_norm": 3.657031536102295, "learning_rate": 5.533596837944664e-07, "loss": 0.5796, "step": 25830 }, { "epoch": 4.863542254846602, "grad_norm": 5.817481517791748, "learning_rate": 5.458309806135894e-07, "loss": 0.3377, "step": 25840 }, { "epoch": 4.865424430641822, "grad_norm": 0.44938135147094727, "learning_rate": 5.383022774327123e-07, "loss": 0.6224, "step": 25850 }, { "epoch": 4.867306606437041, "grad_norm": 67.99935913085938, "learning_rate": 5.307735742518352e-07, "loss": 0.8013, "step": 25860 }, { "epoch": 4.869188782232261, "grad_norm": 34.21944046020508, "learning_rate": 5.23244871070958e-07, "loss": 0.3706, "step": 25870 }, { "epoch": 4.87107095802748, "grad_norm": 12.056262016296387, "learning_rate": 5.15716167890081e-07, "loss": 0.7562, "step": 25880 }, { "epoch": 4.8729531338226995, "grad_norm": 8.577810287475586, "learning_rate": 5.081874647092039e-07, "loss": 0.5821, "step": 25890 }, { "epoch": 4.874835309617918, "grad_norm": 23.58852767944336, "learning_rate": 5.006587615283268e-07, "loss": 0.7323, "step": 25900 }, { "epoch": 4.876717485413137, "grad_norm": 17.26483726501465, "learning_rate": 4.931300583474497e-07, "loss": 0.4348, "step": 25910 }, { "epoch": 4.878599661208357, "grad_norm": 23.311981201171875, "learning_rate": 4.856013551665726e-07, "loss": 0.7779, "step": 25920 }, { "epoch": 4.880481837003576, "grad_norm": 19.61308479309082, "learning_rate": 4.780726519856954e-07, "loss": 0.8464, "step": 25930 }, { "epoch": 4.882364012798796, "grad_norm": 7.677709579467773, "learning_rate": 4.705439488048184e-07, "loss": 0.8081, "step": 25940 }, { "epoch": 4.884246188594014, "grad_norm": 13.238296508789062, "learning_rate": 4.6301524562394134e-07, "loss": 0.6175, "step": 25950 }, { "epoch": 4.886128364389234, "grad_norm": 25.431175231933594, "learning_rate": 4.554865424430642e-07, "loss": 0.6642, "step": 25960 }, { "epoch": 4.888010540184453, "grad_norm": 12.603880882263184, "learning_rate": 4.4795783926218713e-07, "loss": 0.6011, "step": 25970 }, { "epoch": 4.889892715979673, "grad_norm": 43.16578674316406, "learning_rate": 4.4042913608131005e-07, "loss": 0.4965, "step": 25980 }, { "epoch": 4.891774891774892, "grad_norm": 9.851216316223145, "learning_rate": 4.329004329004329e-07, "loss": 0.4479, "step": 25990 }, { "epoch": 4.893657067570111, "grad_norm": 11.268643379211426, "learning_rate": 4.253717297195559e-07, "loss": 0.5438, "step": 26000 }, { "epoch": 4.89553924336533, "grad_norm": 24.760019302368164, "learning_rate": 4.1784302653867876e-07, "loss": 0.5283, "step": 26010 }, { "epoch": 4.897421419160549, "grad_norm": 25.229755401611328, "learning_rate": 4.1031432335780163e-07, "loss": 0.784, "step": 26020 }, { "epoch": 4.899303594955769, "grad_norm": 9.104552268981934, "learning_rate": 4.027856201769246e-07, "loss": 0.4686, "step": 26030 }, { "epoch": 4.901185770750988, "grad_norm": 6.206742286682129, "learning_rate": 3.9525691699604747e-07, "loss": 0.7664, "step": 26040 }, { "epoch": 4.903067946546208, "grad_norm": 15.448500633239746, "learning_rate": 3.8772821381517034e-07, "loss": 0.3457, "step": 26050 }, { "epoch": 4.904950122341427, "grad_norm": 31.12100601196289, "learning_rate": 3.801995106342933e-07, "loss": 0.5846, "step": 26060 }, { "epoch": 4.906832298136646, "grad_norm": 14.74261474609375, "learning_rate": 3.726708074534162e-07, "loss": 0.6615, "step": 26070 }, { "epoch": 4.908714473931865, "grad_norm": 26.152849197387695, "learning_rate": 3.6514210427253905e-07, "loss": 0.6266, "step": 26080 }, { "epoch": 4.9105966497270845, "grad_norm": 27.336023330688477, "learning_rate": 3.5761340109166203e-07, "loss": 0.8022, "step": 26090 }, { "epoch": 4.912478825522304, "grad_norm": 4.440485000610352, "learning_rate": 3.500846979107849e-07, "loss": 0.3889, "step": 26100 }, { "epoch": 4.914361001317523, "grad_norm": 31.62215805053711, "learning_rate": 3.4255599472990776e-07, "loss": 0.2929, "step": 26110 }, { "epoch": 4.916243177112742, "grad_norm": 18.474529266357422, "learning_rate": 3.350272915490307e-07, "loss": 0.7286, "step": 26120 }, { "epoch": 4.918125352907961, "grad_norm": 1.3210591077804565, "learning_rate": 3.274985883681536e-07, "loss": 0.5917, "step": 26130 }, { "epoch": 4.920007528703181, "grad_norm": 22.190696716308594, "learning_rate": 3.1996988518727653e-07, "loss": 0.4259, "step": 26140 }, { "epoch": 4.9218897044984, "grad_norm": 32.777915954589844, "learning_rate": 3.1244118200639945e-07, "loss": 0.458, "step": 26150 }, { "epoch": 4.92377188029362, "grad_norm": 8.841329574584961, "learning_rate": 3.049124788255223e-07, "loss": 0.6918, "step": 26160 }, { "epoch": 4.925654056088838, "grad_norm": 12.418993949890137, "learning_rate": 2.9738377564464524e-07, "loss": 0.581, "step": 26170 }, { "epoch": 4.927536231884058, "grad_norm": 8.253694534301758, "learning_rate": 2.8985507246376816e-07, "loss": 0.7705, "step": 26180 }, { "epoch": 4.929418407679277, "grad_norm": 9.630890846252441, "learning_rate": 2.823263692828911e-07, "loss": 0.594, "step": 26190 }, { "epoch": 4.9313005834744965, "grad_norm": 0.2443116456270218, "learning_rate": 2.7479766610201395e-07, "loss": 0.9983, "step": 26200 }, { "epoch": 4.933182759269716, "grad_norm": 12.538666725158691, "learning_rate": 2.6726896292113687e-07, "loss": 0.8449, "step": 26210 }, { "epoch": 4.935064935064935, "grad_norm": 7.572700023651123, "learning_rate": 2.597402597402598e-07, "loss": 0.4725, "step": 26220 }, { "epoch": 4.936947110860155, "grad_norm": 7.338517189025879, "learning_rate": 2.5221155655938266e-07, "loss": 0.8398, "step": 26230 }, { "epoch": 4.938829286655373, "grad_norm": 4.189211368560791, "learning_rate": 2.446828533785056e-07, "loss": 0.5246, "step": 26240 }, { "epoch": 4.940711462450593, "grad_norm": 10.774524688720703, "learning_rate": 2.3715415019762845e-07, "loss": 0.4691, "step": 26250 }, { "epoch": 4.942593638245812, "grad_norm": 0.5024207234382629, "learning_rate": 2.2962544701675137e-07, "loss": 0.5102, "step": 26260 }, { "epoch": 4.944475814041032, "grad_norm": 5.542119026184082, "learning_rate": 2.220967438358743e-07, "loss": 0.5087, "step": 26270 }, { "epoch": 4.946357989836251, "grad_norm": 17.346418380737305, "learning_rate": 2.145680406549972e-07, "loss": 0.512, "step": 26280 }, { "epoch": 4.94824016563147, "grad_norm": 7.274904251098633, "learning_rate": 2.0703933747412008e-07, "loss": 0.5541, "step": 26290 }, { "epoch": 4.950122341426689, "grad_norm": 7.677774429321289, "learning_rate": 1.99510634293243e-07, "loss": 0.5137, "step": 26300 }, { "epoch": 4.9520045172219085, "grad_norm": 0.39065951108932495, "learning_rate": 1.919819311123659e-07, "loss": 0.6084, "step": 26310 }, { "epoch": 4.953886693017128, "grad_norm": 28.504114151000977, "learning_rate": 1.8445322793148882e-07, "loss": 0.5155, "step": 26320 }, { "epoch": 4.955768868812347, "grad_norm": 11.31637954711914, "learning_rate": 1.7692452475061174e-07, "loss": 0.7137, "step": 26330 }, { "epoch": 4.957651044607566, "grad_norm": 1.688535213470459, "learning_rate": 1.693958215697346e-07, "loss": 0.2389, "step": 26340 }, { "epoch": 4.959533220402785, "grad_norm": 8.698450088500977, "learning_rate": 1.6186711838885753e-07, "loss": 0.4807, "step": 26350 }, { "epoch": 4.961415396198005, "grad_norm": 30.824499130249023, "learning_rate": 1.5433841520798043e-07, "loss": 0.5214, "step": 26360 }, { "epoch": 4.963297571993224, "grad_norm": 2.2312653064727783, "learning_rate": 1.4680971202710335e-07, "loss": 0.6085, "step": 26370 }, { "epoch": 4.965179747788444, "grad_norm": 2.4065072536468506, "learning_rate": 1.3928100884622625e-07, "loss": 0.4064, "step": 26380 }, { "epoch": 4.967061923583663, "grad_norm": 17.495332717895508, "learning_rate": 1.3175230566534914e-07, "loss": 0.5247, "step": 26390 }, { "epoch": 4.9689440993788825, "grad_norm": 14.244489669799805, "learning_rate": 1.2422360248447206e-07, "loss": 0.5497, "step": 26400 }, { "epoch": 4.970826275174101, "grad_norm": 6.616152286529541, "learning_rate": 1.1669489930359497e-07, "loss": 0.7547, "step": 26410 }, { "epoch": 4.9727084509693205, "grad_norm": 36.595096588134766, "learning_rate": 1.0916619612271786e-07, "loss": 0.6431, "step": 26420 }, { "epoch": 4.97459062676454, "grad_norm": 17.054536819458008, "learning_rate": 1.0163749294184077e-07, "loss": 0.4411, "step": 26430 }, { "epoch": 4.976472802559759, "grad_norm": 14.136749267578125, "learning_rate": 9.410878976096368e-08, "loss": 0.8405, "step": 26440 }, { "epoch": 4.978354978354979, "grad_norm": 9.22201156616211, "learning_rate": 8.658008658008659e-08, "loss": 0.3544, "step": 26450 }, { "epoch": 4.980237154150197, "grad_norm": 19.09043312072754, "learning_rate": 7.905138339920948e-08, "loss": 0.5576, "step": 26460 }, { "epoch": 4.982119329945417, "grad_norm": 4.943528652191162, "learning_rate": 7.152268021833239e-08, "loss": 0.3464, "step": 26470 }, { "epoch": 4.984001505740636, "grad_norm": 1.3135852813720703, "learning_rate": 6.39939770374553e-08, "loss": 0.5733, "step": 26480 }, { "epoch": 4.985883681535856, "grad_norm": 19.596176147460938, "learning_rate": 5.64652738565782e-08, "loss": 0.4562, "step": 26490 }, { "epoch": 4.987765857331075, "grad_norm": 31.44451141357422, "learning_rate": 4.893657067570112e-08, "loss": 0.6244, "step": 26500 }, { "epoch": 4.989648033126294, "grad_norm": 10.986721992492676, "learning_rate": 4.1407867494824025e-08, "loss": 0.6193, "step": 26510 }, { "epoch": 4.991530208921513, "grad_norm": 10.592276573181152, "learning_rate": 3.3879164313946926e-08, "loss": 0.649, "step": 26520 }, { "epoch": 4.993412384716732, "grad_norm": 11.80045223236084, "learning_rate": 2.635046113306983e-08, "loss": 0.5307, "step": 26530 }, { "epoch": 4.995294560511952, "grad_norm": 24.187894821166992, "learning_rate": 1.8821757952192736e-08, "loss": 0.7837, "step": 26540 }, { "epoch": 4.997176736307171, "grad_norm": 8.47293472290039, "learning_rate": 1.1293054771315641e-08, "loss": 0.6338, "step": 26550 }, { "epoch": 4.999058912102391, "grad_norm": 63.684627532958984, "learning_rate": 3.7643515904385476e-09, "loss": 0.4886, "step": 26560 }, { "epoch": 5.0, "eval_accuracy": 0.9241333333333334, "eval_loss": 0.2954551577568054, "eval_runtime": 116.8437, "eval_samples_per_second": 64.188, "eval_steps_per_second": 8.028, "step": 26565 }, { "epoch": 5.0, "step": 26565, "total_flos": 1.64815115092992e+19, "train_loss": 0.0, "train_runtime": 0.0821, "train_samples_per_second": 2587573.233, "train_steps_per_second": 323477.096 } ], "logging_steps": 10, "max_steps": 26565, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.64815115092992e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }