|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0180412891247754, |
|
"eval_steps": 500, |
|
"global_step": 25000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 7.78010667114874, |
|
"learning_rate": 4.0236686390532546e-07, |
|
"loss": 0.5346, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 6.216957639101055, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 0.5043, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 7.990357548396736, |
|
"learning_rate": 8.284023668639055e-07, |
|
"loss": 0.5017, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 8.12688107356609, |
|
"learning_rate": 1.0414201183431955e-06, |
|
"loss": 0.4952, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.530843475685683, |
|
"learning_rate": 1.2544378698224851e-06, |
|
"loss": 0.5621, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.308467672405027, |
|
"learning_rate": 1.4674556213017752e-06, |
|
"loss": 0.4549, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.243052870190241, |
|
"learning_rate": 1.6804733727810652e-06, |
|
"loss": 0.4466, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.219726515841495, |
|
"learning_rate": 1.8934911242603552e-06, |
|
"loss": 0.3893, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.627649317339657, |
|
"learning_rate": 2.106508875739645e-06, |
|
"loss": 0.4179, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.44244224679364, |
|
"learning_rate": 2.319526627218935e-06, |
|
"loss": 0.4226, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.612950721768246, |
|
"learning_rate": 2.532544378698225e-06, |
|
"loss": 0.3795, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.283517912051673, |
|
"learning_rate": 2.7455621301775153e-06, |
|
"loss": 0.4276, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.268987062349035, |
|
"learning_rate": 2.958579881656805e-06, |
|
"loss": 0.3619, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.340583800596072, |
|
"learning_rate": 3.171597633136095e-06, |
|
"loss": 0.4244, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.75600646477272, |
|
"learning_rate": 3.384615384615385e-06, |
|
"loss": 0.3852, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.647054711391784, |
|
"learning_rate": 3.597633136094675e-06, |
|
"loss": 0.3809, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.253045067435066, |
|
"learning_rate": 3.8106508875739652e-06, |
|
"loss": 0.3858, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.301184351749545, |
|
"learning_rate": 4.023668639053255e-06, |
|
"loss": 0.3549, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.59195138486003, |
|
"learning_rate": 4.236686390532545e-06, |
|
"loss": 0.4048, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 10.124611929532549, |
|
"learning_rate": 4.449704142011835e-06, |
|
"loss": 0.3646, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.744526068197853, |
|
"learning_rate": 4.662721893491124e-06, |
|
"loss": 0.3677, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 8.246611853098463, |
|
"learning_rate": 4.875739644970415e-06, |
|
"loss": 0.3573, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.675393597081337, |
|
"learning_rate": 5.088757396449705e-06, |
|
"loss": 0.3714, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.407374558199348, |
|
"learning_rate": 5.301775147928995e-06, |
|
"loss": 0.3762, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.5336943019480875, |
|
"learning_rate": 5.514792899408284e-06, |
|
"loss": 0.3423, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.789944607897793, |
|
"learning_rate": 5.727810650887574e-06, |
|
"loss": 0.3382, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.2048712513712765, |
|
"learning_rate": 5.940828402366864e-06, |
|
"loss": 0.355, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.7676635768181255, |
|
"learning_rate": 6.153846153846153e-06, |
|
"loss": 0.3481, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.33865756273462, |
|
"learning_rate": 6.366863905325444e-06, |
|
"loss": 0.3486, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.215416340807466, |
|
"learning_rate": 6.579881656804735e-06, |
|
"loss": 0.3378, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.563753135562715, |
|
"learning_rate": 6.792899408284025e-06, |
|
"loss": 0.3126, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 8.45042187241426, |
|
"learning_rate": 7.005917159763315e-06, |
|
"loss": 0.3231, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.734275587016377, |
|
"learning_rate": 7.218934911242604e-06, |
|
"loss": 0.3533, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.417837093997314, |
|
"learning_rate": 7.431952662721894e-06, |
|
"loss": 0.3335, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 8.615382928114506, |
|
"learning_rate": 7.644970414201183e-06, |
|
"loss": 0.3495, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.6395133490495395, |
|
"learning_rate": 7.857988165680473e-06, |
|
"loss": 0.3371, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 9.710125890008053, |
|
"learning_rate": 8.071005917159764e-06, |
|
"loss": 0.3619, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 8.21056721557438, |
|
"learning_rate": 8.284023668639054e-06, |
|
"loss": 0.3479, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.0850591314913185, |
|
"learning_rate": 8.497041420118344e-06, |
|
"loss": 0.3217, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 8.709374421841343, |
|
"learning_rate": 8.710059171597634e-06, |
|
"loss": 0.345, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.703998514988717, |
|
"learning_rate": 8.923076923076925e-06, |
|
"loss": 0.3544, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.233699532008588, |
|
"learning_rate": 9.136094674556215e-06, |
|
"loss": 0.3449, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.360611479052244, |
|
"learning_rate": 9.349112426035503e-06, |
|
"loss": 0.3709, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.21422760141616, |
|
"learning_rate": 9.562130177514794e-06, |
|
"loss": 0.3031, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 10.285280127672143, |
|
"learning_rate": 9.775147928994084e-06, |
|
"loss": 0.314, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.561551897783339, |
|
"learning_rate": 9.988165680473372e-06, |
|
"loss": 0.34, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.337797520489195, |
|
"learning_rate": 1.0201183431952664e-05, |
|
"loss": 0.3324, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.531967986532953, |
|
"learning_rate": 1.0414201183431953e-05, |
|
"loss": 0.359, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.913190435381454, |
|
"learning_rate": 1.0627218934911243e-05, |
|
"loss": 0.3715, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.525404719704843, |
|
"learning_rate": 1.0840236686390533e-05, |
|
"loss": 0.2926, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.186212103501571, |
|
"learning_rate": 1.1053254437869825e-05, |
|
"loss": 0.351, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.968895396114415, |
|
"learning_rate": 1.1266272189349114e-05, |
|
"loss": 0.3325, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 9.069989076248202, |
|
"learning_rate": 1.1479289940828404e-05, |
|
"loss": 0.3075, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.617321242745247, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.3321, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 8.096724387107496, |
|
"learning_rate": 1.1905325443786983e-05, |
|
"loss": 0.3508, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.972357677530315, |
|
"learning_rate": 1.2118343195266273e-05, |
|
"loss": 0.3031, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.82798416711515, |
|
"learning_rate": 1.2331360946745563e-05, |
|
"loss": 0.2912, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 10.732472169183794, |
|
"learning_rate": 1.2544378698224854e-05, |
|
"loss": 0.2956, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.4415223600685625, |
|
"learning_rate": 1.2757396449704142e-05, |
|
"loss": 0.3234, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.006979596197168, |
|
"learning_rate": 1.2970414201183432e-05, |
|
"loss": 0.3271, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.00254256852378, |
|
"learning_rate": 1.3183431952662723e-05, |
|
"loss": 0.3165, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 14.742499561979336, |
|
"learning_rate": 1.3396449704142011e-05, |
|
"loss": 0.3015, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.14191343340385, |
|
"learning_rate": 1.3609467455621301e-05, |
|
"loss": 0.3273, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.720523746760943, |
|
"learning_rate": 1.3822485207100593e-05, |
|
"loss": 0.3073, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 29.467573544915435, |
|
"learning_rate": 1.4035502958579883e-05, |
|
"loss": 0.2779, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.058319538140977, |
|
"learning_rate": 1.4248520710059172e-05, |
|
"loss": 0.3176, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.979724640901607, |
|
"learning_rate": 1.4461538461538462e-05, |
|
"loss": 0.2919, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 9.296561099611436, |
|
"learning_rate": 1.4674556213017754e-05, |
|
"loss": 0.3073, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.766863350601436, |
|
"learning_rate": 1.4887573964497044e-05, |
|
"loss": 0.3449, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 9.0738349271763, |
|
"learning_rate": 1.5100591715976333e-05, |
|
"loss": 0.3282, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.238889472577541, |
|
"learning_rate": 1.5313609467455623e-05, |
|
"loss": 0.2766, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.068739678701155, |
|
"learning_rate": 1.5526627218934912e-05, |
|
"loss": 0.3527, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8.761831948666881, |
|
"learning_rate": 1.5739644970414204e-05, |
|
"loss": 0.309, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.696276254834093, |
|
"learning_rate": 1.5952662721893492e-05, |
|
"loss": 0.3334, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.698639074786848, |
|
"learning_rate": 1.616568047337278e-05, |
|
"loss": 0.306, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.244148261549131, |
|
"learning_rate": 1.6378698224852073e-05, |
|
"loss": 0.3, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.948838050368144, |
|
"learning_rate": 1.659171597633136e-05, |
|
"loss": 0.3348, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.672363835099913, |
|
"learning_rate": 1.6804733727810653e-05, |
|
"loss": 0.3534, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 50.29837243421749, |
|
"learning_rate": 1.7017751479289942e-05, |
|
"loss": 0.3203, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.03929850767991, |
|
"learning_rate": 1.723076923076923e-05, |
|
"loss": 0.297, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.19469430195841, |
|
"learning_rate": 1.7443786982248522e-05, |
|
"loss": 0.2978, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.055651559785636, |
|
"learning_rate": 1.765680473372781e-05, |
|
"loss": 0.3313, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.162934891578056, |
|
"learning_rate": 1.78698224852071e-05, |
|
"loss": 0.3188, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 7.176153099629785, |
|
"learning_rate": 1.808284023668639e-05, |
|
"loss": 0.3297, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.367781797418254, |
|
"learning_rate": 1.8295857988165683e-05, |
|
"loss": 0.3042, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.373173769997794, |
|
"learning_rate": 1.8508875739644975e-05, |
|
"loss": 0.3266, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 18.577261761115807, |
|
"learning_rate": 1.8721893491124264e-05, |
|
"loss": 0.2736, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.994768648489008, |
|
"learning_rate": 1.8934911242603552e-05, |
|
"loss": 0.3012, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.301807374403158, |
|
"learning_rate": 1.9147928994082844e-05, |
|
"loss": 0.3394, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.456967816650055, |
|
"learning_rate": 1.9360946745562133e-05, |
|
"loss": 0.3329, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.0971107018925625, |
|
"learning_rate": 1.957396449704142e-05, |
|
"loss": 0.3523, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 8.10379030061416, |
|
"learning_rate": 1.9786982248520713e-05, |
|
"loss": 0.3013, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.024454181142386, |
|
"learning_rate": 1.9999999960412883e-05, |
|
"loss": 0.2762, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 8.232339625552946, |
|
"learning_rate": 1.9999982542086008e-05, |
|
"loss": 0.3448, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.602098687169412, |
|
"learning_rate": 1.9999933454128334e-05, |
|
"loss": 0.3398, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 13.60330741158854, |
|
"learning_rate": 1.9999852696695326e-05, |
|
"loss": 0.3449, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.868480851347135, |
|
"learning_rate": 1.9999740270042764e-05, |
|
"loss": 0.3047, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.419546962701075, |
|
"learning_rate": 1.9999596174526744e-05, |
|
"loss": 0.296, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.258953437915027, |
|
"learning_rate": 1.9999420410603655e-05, |
|
"loss": 0.3145, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.591067141592344, |
|
"learning_rate": 1.9999212978830192e-05, |
|
"loss": 0.2967, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.788694443802249, |
|
"learning_rate": 1.9998973879863347e-05, |
|
"loss": 0.2922, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.894924525369044, |
|
"learning_rate": 1.999870311446042e-05, |
|
"loss": 0.2909, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 7.71785690762546, |
|
"learning_rate": 1.9998400683478994e-05, |
|
"loss": 0.3185, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.423565941270898, |
|
"learning_rate": 1.9998066587876964e-05, |
|
"loss": 0.331, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.504973464920754, |
|
"learning_rate": 1.9997700828712502e-05, |
|
"loss": 0.3163, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.0751162961967555, |
|
"learning_rate": 1.999730340714407e-05, |
|
"loss": 0.2935, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.788588249402219, |
|
"learning_rate": 1.9996874324430414e-05, |
|
"loss": 0.304, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.982728358658279, |
|
"learning_rate": 1.9996413581930564e-05, |
|
"loss": 0.3254, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.39883321894128, |
|
"learning_rate": 1.9995921181103827e-05, |
|
"loss": 0.3238, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.349466388539233, |
|
"learning_rate": 1.999539712350977e-05, |
|
"loss": 0.3138, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 8.460848680137447, |
|
"learning_rate": 1.9994841410808238e-05, |
|
"loss": 0.2951, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 12.228980891015102, |
|
"learning_rate": 1.999425404475933e-05, |
|
"loss": 0.313, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 10.128152992141587, |
|
"learning_rate": 1.99936350272234e-05, |
|
"loss": 0.3209, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.366413095980619, |
|
"learning_rate": 1.999298436016105e-05, |
|
"loss": 0.3508, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.935510884433497, |
|
"learning_rate": 1.9992302045633138e-05, |
|
"loss": 0.3087, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.917097921103624, |
|
"learning_rate": 1.9991588085800745e-05, |
|
"loss": 0.3272, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 8.158864972330328, |
|
"learning_rate": 1.9990842482925183e-05, |
|
"loss": 0.3097, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.660258353439845, |
|
"learning_rate": 1.999006523936799e-05, |
|
"loss": 0.3194, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.343908148236521, |
|
"learning_rate": 1.9989256357590915e-05, |
|
"loss": 0.3144, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.935522124005399, |
|
"learning_rate": 1.9988415840155925e-05, |
|
"loss": 0.316, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.118420550913593, |
|
"learning_rate": 1.9987543689725172e-05, |
|
"loss": 0.2935, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.852760915435719, |
|
"learning_rate": 1.998663990906101e-05, |
|
"loss": 0.2982, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.637858430267903, |
|
"learning_rate": 1.9985704501025967e-05, |
|
"loss": 0.3263, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.969298652078407, |
|
"learning_rate": 1.9984737468582746e-05, |
|
"loss": 0.2785, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.290609868079292, |
|
"learning_rate": 1.998373881479422e-05, |
|
"loss": 0.2902, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8.419267863068479, |
|
"learning_rate": 1.9982708542823405e-05, |
|
"loss": 0.2854, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.123890492905641, |
|
"learning_rate": 1.9981646655933466e-05, |
|
"loss": 0.2981, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.10058603098674, |
|
"learning_rate": 1.998055315748771e-05, |
|
"loss": 0.2792, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.6118366078785105, |
|
"learning_rate": 1.997942805094955e-05, |
|
"loss": 0.2905, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 11.075016161215812, |
|
"learning_rate": 1.997827133988252e-05, |
|
"loss": 0.2902, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.887207232453743, |
|
"learning_rate": 1.997708302795026e-05, |
|
"loss": 0.3155, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.39711790141671, |
|
"learning_rate": 1.997586311891649e-05, |
|
"loss": 0.286, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.439201702560111, |
|
"learning_rate": 1.9974611616645007e-05, |
|
"loss": 0.2933, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 7.182735290178756, |
|
"learning_rate": 1.9973328525099675e-05, |
|
"loss": 0.3267, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 8.170254081594555, |
|
"learning_rate": 1.997201384834442e-05, |
|
"loss": 0.2967, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.357829888020736, |
|
"learning_rate": 1.997066759054319e-05, |
|
"loss": 0.3109, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.180030398494684, |
|
"learning_rate": 1.996928975595997e-05, |
|
"loss": 0.3054, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 7.812651853992933, |
|
"learning_rate": 1.996788034895875e-05, |
|
"loss": 0.2852, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.492759192826664, |
|
"learning_rate": 1.9966439374003538e-05, |
|
"loss": 0.305, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 11.941328303638311, |
|
"learning_rate": 1.99649668356583e-05, |
|
"loss": 0.2922, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 8.314661648415811, |
|
"learning_rate": 1.9963462738586993e-05, |
|
"loss": 0.3102, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.301474320450444, |
|
"learning_rate": 1.996192708755351e-05, |
|
"loss": 0.2964, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 8.797389284755965, |
|
"learning_rate": 1.996035988742171e-05, |
|
"loss": 0.2656, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.614687108304631, |
|
"learning_rate": 1.9958761143155357e-05, |
|
"loss": 0.2927, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.623906291629947, |
|
"learning_rate": 1.995713085981813e-05, |
|
"loss": 0.2788, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.684942655651583, |
|
"learning_rate": 1.9955469042573605e-05, |
|
"loss": 0.3051, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.632124459996572, |
|
"learning_rate": 1.9953775696685223e-05, |
|
"loss": 0.3002, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.942563514745161, |
|
"learning_rate": 1.99520508275163e-05, |
|
"loss": 0.2862, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.712590574651518, |
|
"learning_rate": 1.995029444052999e-05, |
|
"loss": 0.2938, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 7.09759309831908, |
|
"learning_rate": 1.9948506541289266e-05, |
|
"loss": 0.3054, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 9.042408105595795, |
|
"learning_rate": 1.994668713545692e-05, |
|
"loss": 0.3041, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.843804903550461, |
|
"learning_rate": 1.994483622879553e-05, |
|
"loss": 0.2958, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 8.109882081629157, |
|
"learning_rate": 1.9942953827167443e-05, |
|
"loss": 0.3115, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.519937602019556, |
|
"learning_rate": 1.994103993653476e-05, |
|
"loss": 0.2873, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 43.825640526729615, |
|
"learning_rate": 1.9939094562959324e-05, |
|
"loss": 0.3084, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.740302289754918, |
|
"learning_rate": 1.993711771260268e-05, |
|
"loss": 0.2898, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 8.585263920916868, |
|
"learning_rate": 1.993510939172609e-05, |
|
"loss": 0.303, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 7.715090235382078, |
|
"learning_rate": 1.9933069606690468e-05, |
|
"loss": 0.3102, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 9.780531981807941, |
|
"learning_rate": 1.99309983639564e-05, |
|
"loss": 0.3077, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.606128027451412, |
|
"learning_rate": 1.99288956700841e-05, |
|
"loss": 0.3131, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.5456142622794875, |
|
"learning_rate": 1.9926761531733403e-05, |
|
"loss": 0.2899, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.881966685047346, |
|
"learning_rate": 1.9924595955663732e-05, |
|
"loss": 0.2834, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.086009895569889, |
|
"learning_rate": 1.9922398948734088e-05, |
|
"loss": 0.2887, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.329585705771699, |
|
"learning_rate": 1.992017051790301e-05, |
|
"loss": 0.2888, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 9.3500534790468, |
|
"learning_rate": 1.991791067022858e-05, |
|
"loss": 0.3168, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.741688450171789, |
|
"learning_rate": 1.9915619412868387e-05, |
|
"loss": 0.2703, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.86462812934889, |
|
"learning_rate": 1.9913296753079484e-05, |
|
"loss": 0.3141, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.6699035733643495, |
|
"learning_rate": 1.9910942698218404e-05, |
|
"loss": 0.2922, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 8.43685481112505, |
|
"learning_rate": 1.990855725574111e-05, |
|
"loss": 0.2836, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 10.06638161800925, |
|
"learning_rate": 1.990614043320298e-05, |
|
"loss": 0.2949, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 8.362504433942911, |
|
"learning_rate": 1.9903692238258783e-05, |
|
"loss": 0.2897, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.788699791177713, |
|
"learning_rate": 1.9901212678662646e-05, |
|
"loss": 0.2907, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.40238575575375, |
|
"learning_rate": 1.989870176226804e-05, |
|
"loss": 0.2609, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.227206933131435, |
|
"learning_rate": 1.9896159497027758e-05, |
|
"loss": 0.3162, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.191089860311128, |
|
"learning_rate": 1.9893585890993877e-05, |
|
"loss": 0.2998, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 8.156421644955156, |
|
"learning_rate": 1.9890980952317745e-05, |
|
"loss": 0.2683, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 7.62639892752842, |
|
"learning_rate": 1.9888344689249945e-05, |
|
"loss": 0.3138, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 8.10913885283575, |
|
"learning_rate": 1.9885677110140272e-05, |
|
"loss": 0.3098, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.974197538110473, |
|
"learning_rate": 1.988297822343771e-05, |
|
"loss": 0.2879, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 8.13170124417466, |
|
"learning_rate": 1.9880248037690406e-05, |
|
"loss": 0.2741, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.373939941911109, |
|
"learning_rate": 1.9877486561545635e-05, |
|
"loss": 0.2818, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.876975035910139, |
|
"learning_rate": 1.9874693803749786e-05, |
|
"loss": 0.2872, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 11.88859663115872, |
|
"learning_rate": 1.987186977314831e-05, |
|
"loss": 0.2787, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.296482127875842, |
|
"learning_rate": 1.9869014478685726e-05, |
|
"loss": 0.3125, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 10.902431223896663, |
|
"learning_rate": 1.986612792940556e-05, |
|
"loss": 0.2696, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.957172435618448, |
|
"learning_rate": 1.986321013445034e-05, |
|
"loss": 0.2846, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.49530713404051, |
|
"learning_rate": 1.9860261103061555e-05, |
|
"loss": 0.2904, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.7681775640908315, |
|
"learning_rate": 1.985728084457963e-05, |
|
"loss": 0.2907, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 11.417291183282801, |
|
"learning_rate": 1.9854269368443898e-05, |
|
"loss": 0.3124, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 9.165271676007183, |
|
"learning_rate": 1.985122668419255e-05, |
|
"loss": 0.2938, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 9.710590629489802, |
|
"learning_rate": 1.984815280146265e-05, |
|
"loss": 0.2805, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 10.32416184835814, |
|
"learning_rate": 1.9845047729990052e-05, |
|
"loss": 0.2939, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 6.123004510419631, |
|
"learning_rate": 1.984191147960941e-05, |
|
"loss": 0.3217, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 8.419418288045916, |
|
"learning_rate": 1.9838744060254113e-05, |
|
"loss": 0.2466, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 8.941869987837809, |
|
"learning_rate": 1.9835545481956295e-05, |
|
"loss": 0.3091, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.854852736746462, |
|
"learning_rate": 1.983231575484676e-05, |
|
"loss": 0.3094, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 10.162127205743055, |
|
"learning_rate": 1.9829054889154978e-05, |
|
"loss": 0.2988, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.1276753090877385, |
|
"learning_rate": 1.982576289520904e-05, |
|
"loss": 0.2875, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.806977159453115, |
|
"learning_rate": 1.982243978343562e-05, |
|
"loss": 0.2943, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.9055487505442015, |
|
"learning_rate": 1.9819085564359977e-05, |
|
"loss": 0.2911, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.466453294884225, |
|
"learning_rate": 1.9815700248605875e-05, |
|
"loss": 0.2902, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.488844395318609, |
|
"learning_rate": 1.9812283846895572e-05, |
|
"loss": 0.2773, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.492586688406755, |
|
"learning_rate": 1.9808836370049786e-05, |
|
"loss": 0.2942, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.870365314571275, |
|
"learning_rate": 1.980535782898766e-05, |
|
"loss": 0.3134, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.474349542297636, |
|
"learning_rate": 1.9801848234726733e-05, |
|
"loss": 0.278, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 7.02153354250866, |
|
"learning_rate": 1.9798307598382887e-05, |
|
"loss": 0.3008, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.5342549251431725, |
|
"learning_rate": 1.9794735931170323e-05, |
|
"loss": 0.2588, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 7.235161691162515, |
|
"learning_rate": 1.9791133244401536e-05, |
|
"loss": 0.2892, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.613883714897734, |
|
"learning_rate": 1.978749954948726e-05, |
|
"loss": 0.3042, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.588985182579549, |
|
"learning_rate": 1.978383485793645e-05, |
|
"loss": 0.2895, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 41.788686405813685, |
|
"learning_rate": 1.9780139181356223e-05, |
|
"loss": 0.2967, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 8.000329487691184, |
|
"learning_rate": 1.9776412531451845e-05, |
|
"loss": 0.3068, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 8.858664509374336, |
|
"learning_rate": 1.977265492002667e-05, |
|
"loss": 0.2904, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.412322055660321, |
|
"learning_rate": 1.9768866358982138e-05, |
|
"loss": 0.302, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.15402072878952, |
|
"learning_rate": 1.9765046860317697e-05, |
|
"loss": 0.2753, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 7.363823390602094, |
|
"learning_rate": 1.9761196436130792e-05, |
|
"loss": 0.3077, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.820012641709484, |
|
"learning_rate": 1.9757315098616813e-05, |
|
"loss": 0.3024, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.291771334516593, |
|
"learning_rate": 1.975340286006906e-05, |
|
"loss": 0.2732, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.880570388428466, |
|
"learning_rate": 1.9749459732878716e-05, |
|
"loss": 0.2491, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.655270317760537, |
|
"learning_rate": 1.9745485729534788e-05, |
|
"loss": 0.2803, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.013092379821028, |
|
"learning_rate": 1.974148086262408e-05, |
|
"loss": 0.2803, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.7211652147787975, |
|
"learning_rate": 1.9737445144831136e-05, |
|
"loss": 0.2637, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8.131187547800137, |
|
"learning_rate": 1.973337858893824e-05, |
|
"loss": 0.3255, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 7.137552013307909, |
|
"learning_rate": 1.972928120782533e-05, |
|
"loss": 0.2668, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.150009725617049, |
|
"learning_rate": 1.972515301446998e-05, |
|
"loss": 0.2854, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.331424258094408, |
|
"learning_rate": 1.972099402194736e-05, |
|
"loss": 0.2866, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 8.392412446366174, |
|
"learning_rate": 1.9716804243430176e-05, |
|
"loss": 0.2616, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.299549339702017, |
|
"learning_rate": 1.971258369218867e-05, |
|
"loss": 0.2983, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.2169745129545575, |
|
"learning_rate": 1.970833238159051e-05, |
|
"loss": 0.276, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.506632605972847, |
|
"learning_rate": 1.9704050325100827e-05, |
|
"loss": 0.2951, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.66483208527068, |
|
"learning_rate": 1.969973753628211e-05, |
|
"loss": 0.2784, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.211925104193477, |
|
"learning_rate": 1.9695394028794195e-05, |
|
"loss": 0.2729, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.20991234694906, |
|
"learning_rate": 1.9691019816394204e-05, |
|
"loss": 0.3152, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.975539574939649, |
|
"learning_rate": 1.9686614912936516e-05, |
|
"loss": 0.2747, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 6.135748637813934, |
|
"learning_rate": 1.968217933237272e-05, |
|
"loss": 0.3028, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 8.994912298940163, |
|
"learning_rate": 1.9677713088751562e-05, |
|
"loss": 0.3043, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 7.649871286543558, |
|
"learning_rate": 1.967321619621892e-05, |
|
"loss": 0.2577, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 6.035703921853307, |
|
"learning_rate": 1.9668688669017722e-05, |
|
"loss": 0.2596, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.4070428696843615, |
|
"learning_rate": 1.9664130521487946e-05, |
|
"loss": 0.2885, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 9.68037240943506, |
|
"learning_rate": 1.9659541768066545e-05, |
|
"loss": 0.2739, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 7.032775442165197, |
|
"learning_rate": 1.965492242328741e-05, |
|
"loss": 0.2832, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 7.038266627020968, |
|
"learning_rate": 1.9650272501781326e-05, |
|
"loss": 0.3053, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.469246363249616, |
|
"learning_rate": 1.9645592018275917e-05, |
|
"loss": 0.2922, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 9.997476259295432, |
|
"learning_rate": 1.964088098759561e-05, |
|
"loss": 0.3029, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.1359256203786705, |
|
"learning_rate": 1.9636139424661588e-05, |
|
"loss": 0.2885, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.857022727186512, |
|
"learning_rate": 1.9631367344491735e-05, |
|
"loss": 0.263, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.720261230840821, |
|
"learning_rate": 1.9626564762200583e-05, |
|
"loss": 0.3083, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 7.113731977499931, |
|
"learning_rate": 1.9621731692999284e-05, |
|
"loss": 0.2789, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.665634774774537, |
|
"learning_rate": 1.961686815219555e-05, |
|
"loss": 0.2591, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.599220599612653, |
|
"learning_rate": 1.9611974155193597e-05, |
|
"loss": 0.2753, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 6.564880520618788, |
|
"learning_rate": 1.960704971749411e-05, |
|
"loss": 0.2805, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 7.418663470463415, |
|
"learning_rate": 1.9602094854694194e-05, |
|
"loss": 0.2782, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 5.491847672130194, |
|
"learning_rate": 1.9597109582487313e-05, |
|
"loss": 0.2702, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 12.43956214256869, |
|
"learning_rate": 1.9592093916663242e-05, |
|
"loss": 0.2972, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 8.800236820155485, |
|
"learning_rate": 1.958704787310803e-05, |
|
"loss": 0.2725, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 6.644759742176537, |
|
"learning_rate": 1.9581971467803934e-05, |
|
"loss": 0.289, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 5.009336147526538, |
|
"learning_rate": 1.9576864716829377e-05, |
|
"loss": 0.2969, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 5.803503477935393, |
|
"learning_rate": 1.95717276363589e-05, |
|
"loss": 0.2774, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 5.894993950320594, |
|
"learning_rate": 1.95665602426631e-05, |
|
"loss": 0.2273, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 6.96986305003759, |
|
"learning_rate": 1.956136255210859e-05, |
|
"loss": 0.2736, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 9.605041419937288, |
|
"learning_rate": 1.955613458115793e-05, |
|
"loss": 0.2907, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 6.647244804794919, |
|
"learning_rate": 1.9550876346369615e-05, |
|
"loss": 0.261, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 5.607026269896423, |
|
"learning_rate": 1.9545587864397955e-05, |
|
"loss": 0.3143, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 13.142033450455475, |
|
"learning_rate": 1.954026915199309e-05, |
|
"loss": 0.2434, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.248788167531875, |
|
"learning_rate": 1.9534920226000902e-05, |
|
"loss": 0.2705, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 13.164180730181236, |
|
"learning_rate": 1.9529541103362962e-05, |
|
"loss": 0.2862, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 5.831970091880435, |
|
"learning_rate": 1.9524131801116487e-05, |
|
"loss": 0.3054, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 5.446448668681817, |
|
"learning_rate": 1.951869233639428e-05, |
|
"loss": 0.2671, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 6.302315229032403, |
|
"learning_rate": 1.951322272642468e-05, |
|
"loss": 0.2765, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 11.243376074569383, |
|
"learning_rate": 1.9507722988531502e-05, |
|
"loss": 0.2582, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7.673489589122099, |
|
"learning_rate": 1.9502193140133983e-05, |
|
"loss": 0.3143, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7.9949329541838265, |
|
"learning_rate": 1.9496633198746736e-05, |
|
"loss": 0.2862, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.907903307657898, |
|
"learning_rate": 1.9491043181979677e-05, |
|
"loss": 0.2926, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 8.67569902665337, |
|
"learning_rate": 1.9485423107537986e-05, |
|
"loss": 0.2741, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7.103858232561379, |
|
"learning_rate": 1.9479772993222038e-05, |
|
"loss": 0.2767, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 8.73970254143099, |
|
"learning_rate": 1.947409285692736e-05, |
|
"loss": 0.232, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 5.799420807956918, |
|
"learning_rate": 1.946838271664457e-05, |
|
"loss": 0.286, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 5.575038878985263, |
|
"learning_rate": 1.9462642590459306e-05, |
|
"loss": 0.2361, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 5.718902752613272, |
|
"learning_rate": 1.9456872496552184e-05, |
|
"loss": 0.2781, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 6.156497233891847, |
|
"learning_rate": 1.9451072453198742e-05, |
|
"loss": 0.2798, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 8.158434812741351, |
|
"learning_rate": 1.9445242478769374e-05, |
|
"loss": 0.2629, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.100053898526847, |
|
"learning_rate": 1.9439382591729265e-05, |
|
"loss": 0.2616, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 10.398486291325238, |
|
"learning_rate": 1.9433492810638355e-05, |
|
"loss": 0.281, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 7.054531497011973, |
|
"learning_rate": 1.942757315415126e-05, |
|
"loss": 0.2899, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 5.42866324225203, |
|
"learning_rate": 1.9421623641017218e-05, |
|
"loss": 0.3102, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 7.133905299895763, |
|
"learning_rate": 1.941564429008004e-05, |
|
"loss": 0.2616, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 6.043327611859842, |
|
"learning_rate": 1.9409635120278035e-05, |
|
"loss": 0.2614, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 7.125098244508386, |
|
"learning_rate": 1.9403596150643957e-05, |
|
"loss": 0.2732, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 7.809924868565428, |
|
"learning_rate": 1.9397527400304944e-05, |
|
"loss": 0.2537, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 6.721114625225486, |
|
"learning_rate": 1.9391428888482466e-05, |
|
"loss": 0.2935, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 5.41764363012664, |
|
"learning_rate": 1.9385300634492244e-05, |
|
"loss": 0.2644, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 6.577839969793495, |
|
"learning_rate": 1.937914265774421e-05, |
|
"loss": 0.2822, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 5.446653006796048, |
|
"learning_rate": 1.9372954977742437e-05, |
|
"loss": 0.2767, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 6.276051357995197, |
|
"learning_rate": 1.9366737614085067e-05, |
|
"loss": 0.2693, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 5.1174083399984935, |
|
"learning_rate": 1.9360490586464265e-05, |
|
"loss": 0.2968, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 8.060800772518713, |
|
"learning_rate": 1.9354213914666154e-05, |
|
"loss": 0.3042, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.946903213944759, |
|
"learning_rate": 1.934790761857074e-05, |
|
"loss": 0.2896, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.9058705221323855, |
|
"learning_rate": 1.934157171815187e-05, |
|
"loss": 0.2697, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.140873173559298, |
|
"learning_rate": 1.9335206233477138e-05, |
|
"loss": 0.3012, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.2613108710926415, |
|
"learning_rate": 1.9328811184707857e-05, |
|
"loss": 0.2616, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 7.103058610195544, |
|
"learning_rate": 1.932238659209897e-05, |
|
"loss": 0.2476, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 7.721454841424903, |
|
"learning_rate": 1.9315932475998994e-05, |
|
"loss": 0.2772, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 7.2979455852743245, |
|
"learning_rate": 1.930944885684996e-05, |
|
"loss": 0.2463, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 5.194835577851161, |
|
"learning_rate": 1.9302935755187335e-05, |
|
"loss": 0.2595, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 8.826615087967348, |
|
"learning_rate": 1.9296393191639976e-05, |
|
"loss": 0.2728, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.8811686686091, |
|
"learning_rate": 1.9289821186930038e-05, |
|
"loss": 0.2998, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 6.458493860362177, |
|
"learning_rate": 1.9283219761872943e-05, |
|
"loss": 0.2608, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 7.035199086314618, |
|
"learning_rate": 1.9276588937377293e-05, |
|
"loss": 0.2789, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 5.133893925330738, |
|
"learning_rate": 1.9269928734444792e-05, |
|
"loss": 0.2858, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 6.888562228890644, |
|
"learning_rate": 1.9263239174170203e-05, |
|
"loss": 0.263, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 7.247178153358964, |
|
"learning_rate": 1.9256520277741276e-05, |
|
"loss": 0.2887, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 5.8972079378636755, |
|
"learning_rate": 1.9249772066438676e-05, |
|
"loss": 0.2693, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.962455745470868, |
|
"learning_rate": 1.924299456163591e-05, |
|
"loss": 0.2749, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 7.258216936978947, |
|
"learning_rate": 1.9236187784799267e-05, |
|
"loss": 0.2957, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 6.194723517380506, |
|
"learning_rate": 1.9229351757487757e-05, |
|
"loss": 0.2773, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 6.5263758820129505, |
|
"learning_rate": 1.9222486501353027e-05, |
|
"loss": 0.293, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 6.366073941639377, |
|
"learning_rate": 1.9215592038139296e-05, |
|
"loss": 0.2755, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 116.03864354978906, |
|
"learning_rate": 1.9208668389683308e-05, |
|
"loss": 0.251, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 5.264420821601751, |
|
"learning_rate": 1.9201715577914223e-05, |
|
"loss": 0.2845, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 7.40581166704595, |
|
"learning_rate": 1.9194733624853584e-05, |
|
"loss": 0.2632, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 5.941729193844859, |
|
"learning_rate": 1.918772255261523e-05, |
|
"loss": 0.2597, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 6.312855927658963, |
|
"learning_rate": 1.9180682383405227e-05, |
|
"loss": 0.2692, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 6.071350558765477, |
|
"learning_rate": 1.9173613139521798e-05, |
|
"loss": 0.2731, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 8.993502072541519, |
|
"learning_rate": 1.9166514843355254e-05, |
|
"loss": 0.2548, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 7.005495911170442, |
|
"learning_rate": 1.9159387517387924e-05, |
|
"loss": 0.2612, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 7.665844444744844, |
|
"learning_rate": 1.915223118419409e-05, |
|
"loss": 0.2501, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 6.458334088069092, |
|
"learning_rate": 1.9145045866439892e-05, |
|
"loss": 0.2762, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 6.819348453934704, |
|
"learning_rate": 1.9137831586883288e-05, |
|
"loss": 0.2826, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 5.080353890354994, |
|
"learning_rate": 1.9130588368373958e-05, |
|
"loss": 0.2738, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 5.836344060052037, |
|
"learning_rate": 1.912331623385324e-05, |
|
"loss": 0.2586, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 8.357013119166787, |
|
"learning_rate": 1.9116015206354067e-05, |
|
"loss": 0.3174, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.307353833021694, |
|
"learning_rate": 1.9108685309000866e-05, |
|
"loss": 0.2721, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 7.002584797605542, |
|
"learning_rate": 1.9101326565009517e-05, |
|
"loss": 0.2581, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 7.299065402050334, |
|
"learning_rate": 1.909393899768726e-05, |
|
"loss": 0.2933, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 6.57009941780665, |
|
"learning_rate": 1.9086522630432638e-05, |
|
"loss": 0.2843, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 6.857683881351832, |
|
"learning_rate": 1.907907748673539e-05, |
|
"loss": 0.273, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 5.553536898181894, |
|
"learning_rate": 1.9071603590176417e-05, |
|
"loss": 0.2623, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 6.30566096206076, |
|
"learning_rate": 1.906410096442768e-05, |
|
"loss": 0.2366, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 10.352010603508685, |
|
"learning_rate": 1.9056569633252136e-05, |
|
"loss": 0.2546, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 7.80077709337333, |
|
"learning_rate": 1.9049009620503663e-05, |
|
"loss": 0.2763, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 8.03821543687826, |
|
"learning_rate": 1.9041420950126976e-05, |
|
"loss": 0.2486, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 5.390540434685423, |
|
"learning_rate": 1.9033803646157558e-05, |
|
"loss": 0.2964, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 7.430208109717449, |
|
"learning_rate": 1.9026157732721585e-05, |
|
"loss": 0.2681, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.857246125768761, |
|
"learning_rate": 1.9018483234035845e-05, |
|
"loss": 0.2719, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 5.565927371122475, |
|
"learning_rate": 1.901078017440767e-05, |
|
"loss": 0.2703, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 6.9976710123560375, |
|
"learning_rate": 1.9003048578234843e-05, |
|
"loss": 0.2566, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 6.478229745256355, |
|
"learning_rate": 1.899528847000554e-05, |
|
"loss": 0.253, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 5.116336291620225, |
|
"learning_rate": 1.898749987429823e-05, |
|
"loss": 0.2529, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 5.587216274900646, |
|
"learning_rate": 1.8979682815781627e-05, |
|
"loss": 0.2848, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 5.679239499396368, |
|
"learning_rate": 1.8971837319214586e-05, |
|
"loss": 0.2435, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 6.1169436663169074, |
|
"learning_rate": 1.8963963409446022e-05, |
|
"loss": 0.2793, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 5.802383972086084, |
|
"learning_rate": 1.8956061111414865e-05, |
|
"loss": 0.2717, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 6.507485407821351, |
|
"learning_rate": 1.8948130450149942e-05, |
|
"loss": 0.3011, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 5.9621692749685415, |
|
"learning_rate": 1.8940171450769924e-05, |
|
"loss": 0.3076, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 54.193219403324576, |
|
"learning_rate": 1.8932184138483223e-05, |
|
"loss": 0.2629, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 7.104622864455409, |
|
"learning_rate": 1.8924168538587956e-05, |
|
"loss": 0.2714, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 7.0480173880874615, |
|
"learning_rate": 1.8916124676471797e-05, |
|
"loss": 0.2736, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 8.412429750327739, |
|
"learning_rate": 1.8908052577611958e-05, |
|
"loss": 0.2644, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 8.78692031185526, |
|
"learning_rate": 1.8899952267575083e-05, |
|
"loss": 0.2402, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 6.684638721458758, |
|
"learning_rate": 1.889182377201716e-05, |
|
"loss": 0.2542, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 6.010627553359556, |
|
"learning_rate": 1.8883667116683457e-05, |
|
"loss": 0.2838, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 7.414666524064716, |
|
"learning_rate": 1.887548232740843e-05, |
|
"loss": 0.2851, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 8.34076720224061, |
|
"learning_rate": 1.886726943011564e-05, |
|
"loss": 0.2516, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 6.094640422225146, |
|
"learning_rate": 1.885902845081767e-05, |
|
"loss": 0.2313, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.148097252407384, |
|
"learning_rate": 1.8850759415616066e-05, |
|
"loss": 0.2689, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 7.561235558598041, |
|
"learning_rate": 1.8842462350701212e-05, |
|
"loss": 0.2983, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 7.87091273248736, |
|
"learning_rate": 1.883413728235228e-05, |
|
"loss": 0.2386, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 6.197625708495748, |
|
"learning_rate": 1.8825784236937146e-05, |
|
"loss": 0.282, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 9.524108011926973, |
|
"learning_rate": 1.8817403240912283e-05, |
|
"loss": 0.2776, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 5.147943823246307, |
|
"learning_rate": 1.8808994320822693e-05, |
|
"loss": 0.2625, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 5.764109011612628, |
|
"learning_rate": 1.8800557503301827e-05, |
|
"loss": 0.2859, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 7.7970156609149335, |
|
"learning_rate": 1.8792092815071498e-05, |
|
"loss": 0.2589, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 6.220123292737489, |
|
"learning_rate": 1.8783600282941782e-05, |
|
"loss": 0.269, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 6.875941264134116, |
|
"learning_rate": 1.877507993381096e-05, |
|
"loss": 0.2624, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 5.721394912188018, |
|
"learning_rate": 1.8766531794665402e-05, |
|
"loss": 0.2571, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.99318335916291, |
|
"learning_rate": 1.8757955892579504e-05, |
|
"loss": 0.26, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.692727585899676, |
|
"learning_rate": 1.87493522547156e-05, |
|
"loss": 0.2635, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.5007755110350525, |
|
"learning_rate": 1.874072090832386e-05, |
|
"loss": 0.2754, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 7.775379340923738, |
|
"learning_rate": 1.873206188074223e-05, |
|
"loss": 0.2708, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 4.970941791912674, |
|
"learning_rate": 1.872337519939631e-05, |
|
"loss": 0.2592, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 7.276189396167904, |
|
"learning_rate": 1.8714660891799302e-05, |
|
"loss": 0.2648, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 6.724776718800752, |
|
"learning_rate": 1.870591898555191e-05, |
|
"loss": 0.2606, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 7.735768695454274, |
|
"learning_rate": 1.8697149508342237e-05, |
|
"loss": 0.2511, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 5.05914779633595, |
|
"learning_rate": 1.868835248794573e-05, |
|
"loss": 0.2609, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 8.031314122281715, |
|
"learning_rate": 1.8679527952225054e-05, |
|
"loss": 0.2718, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 5.619780768194464, |
|
"learning_rate": 1.867067592913004e-05, |
|
"loss": 0.2717, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 7.595427904662886, |
|
"learning_rate": 1.8661796446697557e-05, |
|
"loss": 0.2536, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 5.462276616537402, |
|
"learning_rate": 1.8652889533051473e-05, |
|
"loss": 0.2674, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 5.302383844019715, |
|
"learning_rate": 1.864395521640252e-05, |
|
"loss": 0.2856, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.703940083284321, |
|
"learning_rate": 1.8634993525048227e-05, |
|
"loss": 0.2609, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 6.585961827134786, |
|
"learning_rate": 1.862600448737283e-05, |
|
"loss": 0.265, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 7.27689896277283, |
|
"learning_rate": 1.861698813184717e-05, |
|
"loss": 0.3018, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 6.231232809733686, |
|
"learning_rate": 1.860794448702863e-05, |
|
"loss": 0.2268, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 7.794911353272152, |
|
"learning_rate": 1.8598873581561e-05, |
|
"loss": 0.2632, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.977335614708055, |
|
"learning_rate": 1.8589775444174436e-05, |
|
"loss": 0.3097, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 7.607942857642037, |
|
"learning_rate": 1.858065010368533e-05, |
|
"loss": 0.2658, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.109669397778123, |
|
"learning_rate": 1.857149758899624e-05, |
|
"loss": 0.2613, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.142102090556645, |
|
"learning_rate": 1.8562317929095796e-05, |
|
"loss": 0.2769, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.98370343700879, |
|
"learning_rate": 1.8553111153058593e-05, |
|
"loss": 0.2642, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 6.375900504146025, |
|
"learning_rate": 1.8543877290045122e-05, |
|
"loss": 0.2646, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 7.277577534154136, |
|
"learning_rate": 1.853461636930166e-05, |
|
"loss": 0.2806, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 6.81435963858201, |
|
"learning_rate": 1.852532842016019e-05, |
|
"loss": 0.2536, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 5.854006003712663, |
|
"learning_rate": 1.851601347203829e-05, |
|
"loss": 0.2447, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 7.787886275359923, |
|
"learning_rate": 1.8506671554439064e-05, |
|
"loss": 0.2663, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 9.21089898409568, |
|
"learning_rate": 1.849730269695103e-05, |
|
"loss": 0.2601, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 8.590889527489873, |
|
"learning_rate": 1.8487906929248028e-05, |
|
"loss": 0.2531, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 5.551684548356732, |
|
"learning_rate": 1.8478484281089143e-05, |
|
"loss": 0.2605, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.227848217032472, |
|
"learning_rate": 1.8469034782318585e-05, |
|
"loss": 0.2728, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 6.015758166139706, |
|
"learning_rate": 1.8459558462865613e-05, |
|
"loss": 0.2883, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 6.568658173678755, |
|
"learning_rate": 1.845005535274444e-05, |
|
"loss": 0.2454, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 5.985752674217696, |
|
"learning_rate": 1.844052548205412e-05, |
|
"loss": 0.2442, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 6.3219678524060425, |
|
"learning_rate": 1.843096888097848e-05, |
|
"loss": 0.2912, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.096257730243316, |
|
"learning_rate": 1.8421385579785997e-05, |
|
"loss": 0.2636, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 6.396648972118899, |
|
"learning_rate": 1.8411775608829722e-05, |
|
"loss": 0.2324, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.782379216505, |
|
"learning_rate": 1.8402138998547174e-05, |
|
"loss": 0.2675, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 8.676707198167653, |
|
"learning_rate": 1.839247577946025e-05, |
|
"loss": 0.2843, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 5.39138478992206, |
|
"learning_rate": 1.8382785982175118e-05, |
|
"loss": 0.2742, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 5.818443622984385, |
|
"learning_rate": 1.8373069637382136e-05, |
|
"loss": 0.26, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 8.95366226368456, |
|
"learning_rate": 1.8363326775855737e-05, |
|
"loss": 0.2687, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.96756080281063, |
|
"learning_rate": 1.8353557428454346e-05, |
|
"loss": 0.2425, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 6.577104865413394, |
|
"learning_rate": 1.8343761626120272e-05, |
|
"loss": 0.2688, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 6.6269283727065, |
|
"learning_rate": 1.8333939399879617e-05, |
|
"loss": 0.2808, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.016348140974161, |
|
"learning_rate": 1.8324090780842173e-05, |
|
"loss": 0.2511, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.5454363034081116, |
|
"learning_rate": 1.831421580020133e-05, |
|
"loss": 0.252, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 5.837760589468463, |
|
"learning_rate": 1.830431448923396e-05, |
|
"loss": 0.2728, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 6.154380243306325, |
|
"learning_rate": 1.8294386879300353e-05, |
|
"loss": 0.2867, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 7.71122937485844, |
|
"learning_rate": 1.8284433001844073e-05, |
|
"loss": 0.2302, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 6.86335128201322, |
|
"learning_rate": 1.8274452888391894e-05, |
|
"loss": 0.2586, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 5.661853354206643, |
|
"learning_rate": 1.8264446570553682e-05, |
|
"loss": 0.2505, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 5.982364804963667, |
|
"learning_rate": 1.82544140800223e-05, |
|
"loss": 0.2673, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.739028708176796, |
|
"learning_rate": 1.824435544857351e-05, |
|
"loss": 0.2678, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.91420533377473, |
|
"learning_rate": 1.823427070806587e-05, |
|
"loss": 0.2559, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 5.618249360419533, |
|
"learning_rate": 1.8224159890440623e-05, |
|
"loss": 0.2493, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 5.896677808188606, |
|
"learning_rate": 1.821402302772162e-05, |
|
"loss": 0.2585, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 6.073985124124518, |
|
"learning_rate": 1.82038601520152e-05, |
|
"loss": 0.2452, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 7.1459209410818, |
|
"learning_rate": 1.819367129551008e-05, |
|
"loss": 0.2592, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 6.390094315335785, |
|
"learning_rate": 1.8183456490477287e-05, |
|
"loss": 0.2461, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.294426005863845, |
|
"learning_rate": 1.8173215769270015e-05, |
|
"loss": 0.2685, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.276924483715485, |
|
"learning_rate": 1.8162949164323554e-05, |
|
"loss": 0.2615, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 7.331765382932756, |
|
"learning_rate": 1.8152656708155173e-05, |
|
"loss": 0.2828, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.361402122667844, |
|
"learning_rate": 1.8142338433364012e-05, |
|
"loss": 0.2849, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 6.712375473487036, |
|
"learning_rate": 1.8131994372630995e-05, |
|
"loss": 0.2716, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 8.103353922148388, |
|
"learning_rate": 1.812162455871872e-05, |
|
"loss": 0.2703, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.585974100152074, |
|
"learning_rate": 1.8111229024471334e-05, |
|
"loss": 0.2386, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 6.8332489132512375, |
|
"learning_rate": 1.8100807802814467e-05, |
|
"loss": 0.2935, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 5.556964992180211, |
|
"learning_rate": 1.80903609267551e-05, |
|
"loss": 0.2404, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 6.524527124099894, |
|
"learning_rate": 1.8079888429381472e-05, |
|
"loss": 0.2477, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 6.394125877212817, |
|
"learning_rate": 1.8069390343862972e-05, |
|
"loss": 0.2585, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 7.212304875264878, |
|
"learning_rate": 1.805886670345003e-05, |
|
"loss": 0.2514, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 5.915336602662839, |
|
"learning_rate": 1.8048317541474015e-05, |
|
"loss": 0.2554, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 6.204874325324116, |
|
"learning_rate": 1.803774289134714e-05, |
|
"loss": 0.2663, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.9458264028130525, |
|
"learning_rate": 1.8027142786562334e-05, |
|
"loss": 0.2374, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 5.66437734846908, |
|
"learning_rate": 1.8016517260693152e-05, |
|
"loss": 0.2173, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 8.8145498502476, |
|
"learning_rate": 1.800586634739367e-05, |
|
"loss": 0.2672, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 5.225621616310874, |
|
"learning_rate": 1.799519008039837e-05, |
|
"loss": 0.263, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 6.749141497235558, |
|
"learning_rate": 1.7984488493522033e-05, |
|
"loss": 0.294, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 6.5925500148457115, |
|
"learning_rate": 1.7973761620659645e-05, |
|
"loss": 0.2549, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.6612011894705097, |
|
"learning_rate": 1.7963009495786262e-05, |
|
"loss": 0.274, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 7.730637018917763, |
|
"learning_rate": 1.795223215295694e-05, |
|
"loss": 0.2476, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 5.253387992852078, |
|
"learning_rate": 1.7941429626306597e-05, |
|
"loss": 0.2557, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 6.185451592355014, |
|
"learning_rate": 1.7930601950049918e-05, |
|
"loss": 0.2414, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 8.613330410148825, |
|
"learning_rate": 1.7919749158481238e-05, |
|
"loss": 0.252, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.082779957130279, |
|
"learning_rate": 1.7908871285974452e-05, |
|
"loss": 0.246, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 5.080789002249157, |
|
"learning_rate": 1.789796836698288e-05, |
|
"loss": 0.2241, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 5.616004872409631, |
|
"learning_rate": 1.788704043603918e-05, |
|
"loss": 0.2635, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.6896605535132005, |
|
"learning_rate": 1.787608752775523e-05, |
|
"loss": 0.2496, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 6.020003669712304, |
|
"learning_rate": 1.786510967682201e-05, |
|
"loss": 0.2742, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.869330214670387, |
|
"learning_rate": 1.7854106918009516e-05, |
|
"loss": 0.2554, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 5.463125770044224, |
|
"learning_rate": 1.7843079286166613e-05, |
|
"loss": 0.256, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 12.859151326084799, |
|
"learning_rate": 1.7832026816220964e-05, |
|
"loss": 0.3044, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 7.462079888408213, |
|
"learning_rate": 1.7820949543178893e-05, |
|
"loss": 0.2603, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 6.251675190537996, |
|
"learning_rate": 1.7809847502125287e-05, |
|
"loss": 0.2524, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 20.20686096910179, |
|
"learning_rate": 1.779872072822348e-05, |
|
"loss": 0.2727, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 11.117280832355938, |
|
"learning_rate": 1.7787569256715128e-05, |
|
"loss": 0.2751, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 6.174365967852932, |
|
"learning_rate": 1.7776393122920136e-05, |
|
"loss": 0.2465, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 6.5845686642808205, |
|
"learning_rate": 1.7765192362236505e-05, |
|
"loss": 0.2637, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 9.227894944405277, |
|
"learning_rate": 1.775396701014024e-05, |
|
"loss": 0.2594, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 6.0294211980015255, |
|
"learning_rate": 1.7742717102185233e-05, |
|
"loss": 0.2506, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 6.611585459356701, |
|
"learning_rate": 1.7731442674003153e-05, |
|
"loss": 0.256, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 6.474013099428535, |
|
"learning_rate": 1.772014376130333e-05, |
|
"loss": 0.2509, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.050917561517386, |
|
"learning_rate": 1.7708820399872644e-05, |
|
"loss": 0.2597, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 7.523512541811629, |
|
"learning_rate": 1.7697472625575415e-05, |
|
"loss": 0.2617, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.674855993980255, |
|
"learning_rate": 1.768610047435328e-05, |
|
"loss": 0.2148, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.581193699152847, |
|
"learning_rate": 1.7674703982225084e-05, |
|
"loss": 0.2485, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 5.995347444394187, |
|
"learning_rate": 1.7663283185286778e-05, |
|
"loss": 0.2504, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 6.106039165812286, |
|
"learning_rate": 1.7651838119711278e-05, |
|
"loss": 0.2591, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 5.544368037680747, |
|
"learning_rate": 1.7640368821748374e-05, |
|
"loss": 0.2589, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 11.908781488384356, |
|
"learning_rate": 1.7628875327724604e-05, |
|
"loss": 0.24, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 5.2162186199664005, |
|
"learning_rate": 1.761735767404314e-05, |
|
"loss": 0.279, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 8.332009731717408, |
|
"learning_rate": 1.760581589718369e-05, |
|
"loss": 0.2523, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 6.811834460305066, |
|
"learning_rate": 1.759425003370234e-05, |
|
"loss": 0.2422, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 10.001650864708848, |
|
"learning_rate": 1.758266012023149e-05, |
|
"loss": 0.2415, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 14.181135321229519, |
|
"learning_rate": 1.7571046193479697e-05, |
|
"loss": 0.2439, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 5.304371617930666, |
|
"learning_rate": 1.7559408290231582e-05, |
|
"loss": 0.2883, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 10.159891549680514, |
|
"learning_rate": 1.754774644734771e-05, |
|
"loss": 0.2402, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 21.596871665189294, |
|
"learning_rate": 1.753606070176446e-05, |
|
"loss": 0.2646, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.6266946448855064, |
|
"learning_rate": 1.752435109049392e-05, |
|
"loss": 0.2463, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.461139967802549, |
|
"learning_rate": 1.7512617650623776e-05, |
|
"loss": 0.2343, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 5.8844648373593955, |
|
"learning_rate": 1.7500860419317183e-05, |
|
"loss": 0.251, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 9.038354738793856, |
|
"learning_rate": 1.7489079433812638e-05, |
|
"loss": 0.2494, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 8.591404154257724, |
|
"learning_rate": 1.7477274731423892e-05, |
|
"loss": 0.2374, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 5.9870710947999815, |
|
"learning_rate": 1.7465446349539797e-05, |
|
"loss": 0.2206, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 6.228813578147013, |
|
"learning_rate": 1.7453594325624224e-05, |
|
"loss": 0.2462, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 5.257017078287017, |
|
"learning_rate": 1.7441718697215904e-05, |
|
"loss": 0.2409, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 6.952956019716318, |
|
"learning_rate": 1.742981950192835e-05, |
|
"loss": 0.2521, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 5.5548892299756805, |
|
"learning_rate": 1.7417896777449706e-05, |
|
"loss": 0.2647, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 5.73273030739662, |
|
"learning_rate": 1.7405950561542636e-05, |
|
"loss": 0.2473, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 5.8226292447674775, |
|
"learning_rate": 1.7393980892044222e-05, |
|
"loss": 0.2799, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 6.573153903103647, |
|
"learning_rate": 1.738198780686582e-05, |
|
"loss": 0.2391, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 6.2081294015592094, |
|
"learning_rate": 1.7369971343992953e-05, |
|
"loss": 0.2441, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 7.239395541675969, |
|
"learning_rate": 1.735793154148519e-05, |
|
"loss": 0.2467, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 6.574019720880623, |
|
"learning_rate": 1.7345868437476016e-05, |
|
"loss": 0.2742, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.932079883792344, |
|
"learning_rate": 1.733378207017273e-05, |
|
"loss": 0.2799, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 7.965596611059161, |
|
"learning_rate": 1.7321672477856297e-05, |
|
"loss": 0.268, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 6.637332593742831, |
|
"learning_rate": 1.730953969888126e-05, |
|
"loss": 0.281, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.598400020154981, |
|
"learning_rate": 1.729738377167559e-05, |
|
"loss": 0.2688, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 10.008276375495472, |
|
"learning_rate": 1.728520473474057e-05, |
|
"loss": 0.2424, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 9.609588968019253, |
|
"learning_rate": 1.7273002626650693e-05, |
|
"loss": 0.2562, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 6.246946580790647, |
|
"learning_rate": 1.726077748605352e-05, |
|
"loss": 0.2536, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 5.207954250527354, |
|
"learning_rate": 1.724852935166955e-05, |
|
"loss": 0.2803, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 6.83554630577102, |
|
"learning_rate": 1.723625826229212e-05, |
|
"loss": 0.2366, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 5.2741649888827, |
|
"learning_rate": 1.7223964256787275e-05, |
|
"loss": 0.2589, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.504793580943435, |
|
"learning_rate": 1.7211647374093644e-05, |
|
"loss": 0.2654, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 5.074320615196733, |
|
"learning_rate": 1.71993076532223e-05, |
|
"loss": 0.2531, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 7.4921309833960645, |
|
"learning_rate": 1.7186945133256663e-05, |
|
"loss": 0.2452, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.773435701909952, |
|
"learning_rate": 1.7174559853352366e-05, |
|
"loss": 0.2786, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 5.190944401366304, |
|
"learning_rate": 1.7162151852737114e-05, |
|
"loss": 0.2082, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 6.8860794956428215, |
|
"learning_rate": 1.7149721170710597e-05, |
|
"loss": 0.2593, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 5.315969613200098, |
|
"learning_rate": 1.7137267846644324e-05, |
|
"loss": 0.2451, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 8.924983723943493, |
|
"learning_rate": 1.712479191998153e-05, |
|
"loss": 0.2487, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.785603454868163, |
|
"learning_rate": 1.711229343023703e-05, |
|
"loss": 0.275, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.5511584473505895, |
|
"learning_rate": 1.709977241699711e-05, |
|
"loss": 0.2438, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 6.601440573023448, |
|
"learning_rate": 1.7087228919919395e-05, |
|
"loss": 0.2682, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 8.06521205975687, |
|
"learning_rate": 1.7074662978732713e-05, |
|
"loss": 0.2672, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 5.877886448612562, |
|
"learning_rate": 1.7062074633236992e-05, |
|
"loss": 0.2415, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 6.00267509589556, |
|
"learning_rate": 1.704946392330311e-05, |
|
"loss": 0.245, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 18.727472632503616, |
|
"learning_rate": 1.703683088887278e-05, |
|
"loss": 0.2527, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 8.42578939933542, |
|
"learning_rate": 1.7024175569958435e-05, |
|
"loss": 0.2447, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 15.871158165018187, |
|
"learning_rate": 1.7011498006643075e-05, |
|
"loss": 0.2611, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.623538224443551, |
|
"learning_rate": 1.6998798239080167e-05, |
|
"loss": 0.2521, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 6.908983060916792, |
|
"learning_rate": 1.698607630749349e-05, |
|
"loss": 0.2298, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 6.502465294111384, |
|
"learning_rate": 1.6973332252177036e-05, |
|
"loss": 0.2498, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.978479228853818, |
|
"learning_rate": 1.6960566113494865e-05, |
|
"loss": 0.252, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 5.650381173298351, |
|
"learning_rate": 1.694777793188098e-05, |
|
"loss": 0.2288, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 7.073746360539243, |
|
"learning_rate": 1.6934967747839202e-05, |
|
"loss": 0.2519, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 5.927901369661737, |
|
"learning_rate": 1.6922135601943037e-05, |
|
"loss": 0.265, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 5.53567758715019, |
|
"learning_rate": 1.690928153483555e-05, |
|
"loss": 0.25, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 7.570944618942586, |
|
"learning_rate": 1.6896405587229247e-05, |
|
"loss": 0.2549, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 7.379565103013804, |
|
"learning_rate": 1.6883507799905922e-05, |
|
"loss": 0.2363, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 9.023229502472875, |
|
"learning_rate": 1.6870588213716555e-05, |
|
"loss": 0.2832, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 5.6792260655491855, |
|
"learning_rate": 1.6857646869581153e-05, |
|
"loss": 0.228, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 7.456793627942026, |
|
"learning_rate": 1.6844683808488647e-05, |
|
"loss": 0.2494, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.8011477449229885, |
|
"learning_rate": 1.6831699071496758e-05, |
|
"loss": 0.2634, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 6.58057290965885, |
|
"learning_rate": 1.681869269973184e-05, |
|
"loss": 0.2577, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 5.68008811828603, |
|
"learning_rate": 1.68056647343888e-05, |
|
"loss": 0.2297, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.528010244716758, |
|
"learning_rate": 1.6792615216730907e-05, |
|
"loss": 0.2196, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 5.853566456861371, |
|
"learning_rate": 1.6779544188089715e-05, |
|
"loss": 0.2629, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 10.986926893405414, |
|
"learning_rate": 1.67664516898649e-05, |
|
"loss": 0.2302, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 7.730824034913035, |
|
"learning_rate": 1.6753337763524137e-05, |
|
"loss": 0.2336, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 7.922173067463235, |
|
"learning_rate": 1.6740202450602976e-05, |
|
"loss": 0.2686, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 5.406865255814246, |
|
"learning_rate": 1.67270457927047e-05, |
|
"loss": 0.226, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 6.843481049848729, |
|
"learning_rate": 1.6713867831500195e-05, |
|
"loss": 0.2586, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 5.49549924323287, |
|
"learning_rate": 1.670066860872783e-05, |
|
"loss": 0.2627, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 6.183808429627808, |
|
"learning_rate": 1.6687448166193306e-05, |
|
"loss": 0.2749, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.378810204329709, |
|
"learning_rate": 1.667420654576954e-05, |
|
"loss": 0.2558, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 6.028002244995752, |
|
"learning_rate": 1.666094378939652e-05, |
|
"loss": 0.2554, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 7.776788987779546, |
|
"learning_rate": 1.664765993908118e-05, |
|
"loss": 0.2326, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 7.503277380435426, |
|
"learning_rate": 1.663435503689726e-05, |
|
"loss": 0.2707, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 6.303861845235693, |
|
"learning_rate": 1.6621029124985195e-05, |
|
"loss": 0.2435, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 7.213728574312154, |
|
"learning_rate": 1.6607682245551935e-05, |
|
"loss": 0.2514, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 5.2552293437415525, |
|
"learning_rate": 1.6594314440870864e-05, |
|
"loss": 0.2397, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 6.538249814157013, |
|
"learning_rate": 1.6580925753281634e-05, |
|
"loss": 0.2655, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 5.2378821622768905, |
|
"learning_rate": 1.6567516225190035e-05, |
|
"loss": 0.2607, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 5.674850314010563, |
|
"learning_rate": 1.655408589906787e-05, |
|
"loss": 0.2723, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 7.192949169932349, |
|
"learning_rate": 1.654063481745281e-05, |
|
"loss": 0.2561, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 13.135993930717675, |
|
"learning_rate": 1.652716302294828e-05, |
|
"loss": 0.2382, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.887607996691356, |
|
"learning_rate": 1.651367055822329e-05, |
|
"loss": 0.2863, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 7.367579978609729, |
|
"learning_rate": 1.6500157466012324e-05, |
|
"loss": 0.2379, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 8.199270857981157, |
|
"learning_rate": 1.6486623789115205e-05, |
|
"loss": 0.2432, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 6.243091274334211, |
|
"learning_rate": 1.6473069570396942e-05, |
|
"loss": 0.2635, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 5.6352137765892545, |
|
"learning_rate": 1.6459494852787622e-05, |
|
"loss": 0.2292, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 5.2104929401235305, |
|
"learning_rate": 1.6445899679282248e-05, |
|
"loss": 0.2545, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 5.635847694521193, |
|
"learning_rate": 1.6432284092940607e-05, |
|
"loss": 0.247, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 5.853851889115171, |
|
"learning_rate": 1.6418648136887152e-05, |
|
"loss": 0.2323, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.98208977143132, |
|
"learning_rate": 1.6404991854310846e-05, |
|
"loss": 0.238, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 5.560280174770714, |
|
"learning_rate": 1.6391315288465027e-05, |
|
"loss": 0.2589, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 11.332988584174231, |
|
"learning_rate": 1.637761848266729e-05, |
|
"loss": 0.2437, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 13.079688339953384, |
|
"learning_rate": 1.6363901480299323e-05, |
|
"loss": 0.2489, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 6.852537601204953, |
|
"learning_rate": 1.6350164324806787e-05, |
|
"loss": 0.218, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 6.384240727219325, |
|
"learning_rate": 1.633640705969917e-05, |
|
"loss": 0.2419, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.348764283501352, |
|
"learning_rate": 1.6322629728549665e-05, |
|
"loss": 0.2037, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 5.096264739138052, |
|
"learning_rate": 1.6308832374994997e-05, |
|
"loss": 0.2502, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.471177088927129, |
|
"learning_rate": 1.6295015042735336e-05, |
|
"loss": 0.2435, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 7.886308089698534, |
|
"learning_rate": 1.6281177775534106e-05, |
|
"loss": 0.2367, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 5.0872043608074335, |
|
"learning_rate": 1.6267320617217886e-05, |
|
"loss": 0.2618, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 7.332403239597943, |
|
"learning_rate": 1.6253443611676247e-05, |
|
"loss": 0.2377, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 5.2156408493688, |
|
"learning_rate": 1.6239546802861628e-05, |
|
"loss": 0.2588, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 14.389605988283588, |
|
"learning_rate": 1.6225630234789186e-05, |
|
"loss": 0.2359, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 6.61108607154756, |
|
"learning_rate": 1.621169395153666e-05, |
|
"loss": 0.2454, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.92623925749379, |
|
"learning_rate": 1.6197737997244242e-05, |
|
"loss": 0.2504, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 6.729876438497323, |
|
"learning_rate": 1.6183762416114417e-05, |
|
"loss": 0.231, |
|
"step": 12020 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.91119912664639, |
|
"learning_rate": 1.6169767252411843e-05, |
|
"loss": 0.2732, |
|
"step": 12040 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 7.372474108547359, |
|
"learning_rate": 1.615575255046319e-05, |
|
"loss": 0.2396, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.844310112839635, |
|
"learning_rate": 1.6141718354657023e-05, |
|
"loss": 0.2682, |
|
"step": 12080 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 7.827541428550464, |
|
"learning_rate": 1.6127664709443642e-05, |
|
"loss": 0.2351, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 6.394194783450918, |
|
"learning_rate": 1.6113591659334952e-05, |
|
"loss": 0.277, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 6.728544539125102, |
|
"learning_rate": 1.609949924890432e-05, |
|
"loss": 0.2517, |
|
"step": 12140 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.095514979882195, |
|
"learning_rate": 1.6085387522786432e-05, |
|
"loss": 0.2317, |
|
"step": 12160 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 6.899190893971197, |
|
"learning_rate": 1.6071256525677144e-05, |
|
"loss": 0.239, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 5.002813882583922, |
|
"learning_rate": 1.6057106302333366e-05, |
|
"loss": 0.2411, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 6.7562128367712, |
|
"learning_rate": 1.6042936897572883e-05, |
|
"loss": 0.2347, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 9.896004658604653, |
|
"learning_rate": 1.6028748356274247e-05, |
|
"loss": 0.2526, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 7.972800268940516, |
|
"learning_rate": 1.6014540723376623e-05, |
|
"loss": 0.2505, |
|
"step": 12260 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 5.170343546862058, |
|
"learning_rate": 1.600031404387963e-05, |
|
"loss": 0.2478, |
|
"step": 12280 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 6.356344714814083, |
|
"learning_rate": 1.5986068362843224e-05, |
|
"loss": 0.2767, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 6.20794198597022, |
|
"learning_rate": 1.5971803725387544e-05, |
|
"loss": 0.2533, |
|
"step": 12320 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 7.368279449995274, |
|
"learning_rate": 1.5957520176692766e-05, |
|
"loss": 0.2706, |
|
"step": 12340 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 9.218421438795374, |
|
"learning_rate": 1.594321776199896e-05, |
|
"loss": 0.2447, |
|
"step": 12360 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.4653346268657845, |
|
"learning_rate": 1.592889652660596e-05, |
|
"loss": 0.2339, |
|
"step": 12380 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 6.741041667370887, |
|
"learning_rate": 1.5914556515873197e-05, |
|
"loss": 0.1749, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.207049838195936, |
|
"learning_rate": 1.590019777521959e-05, |
|
"loss": 0.1849, |
|
"step": 12420 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 7.1496607666636285, |
|
"learning_rate": 1.588582035012336e-05, |
|
"loss": 0.1743, |
|
"step": 12440 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 7.5265979882421865, |
|
"learning_rate": 1.587142428612191e-05, |
|
"loss": 0.1868, |
|
"step": 12460 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 5.651063343012383, |
|
"learning_rate": 1.5857009628811692e-05, |
|
"loss": 0.1983, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 9.202976607727676, |
|
"learning_rate": 1.5842576423848034e-05, |
|
"loss": 0.1917, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 5.832342590483985, |
|
"learning_rate": 1.582812471694501e-05, |
|
"loss": 0.2189, |
|
"step": 12520 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 6.20925991986496, |
|
"learning_rate": 1.5813654553875307e-05, |
|
"loss": 0.1941, |
|
"step": 12540 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 6.9734995441552865, |
|
"learning_rate": 1.579916598047006e-05, |
|
"loss": 0.1722, |
|
"step": 12560 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 5.261181866981142, |
|
"learning_rate": 1.578465904261871e-05, |
|
"loss": 0.1841, |
|
"step": 12580 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 6.347552583288099, |
|
"learning_rate": 1.5770133786268867e-05, |
|
"loss": 0.2178, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 5.354096329322261, |
|
"learning_rate": 1.5755590257426172e-05, |
|
"loss": 0.2037, |
|
"step": 12620 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 6.433760955249804, |
|
"learning_rate": 1.5741028502154122e-05, |
|
"loss": 0.1918, |
|
"step": 12640 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 10.724559043942634, |
|
"learning_rate": 1.572644856657396e-05, |
|
"loss": 0.1943, |
|
"step": 12660 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 7.14880036647321, |
|
"learning_rate": 1.571185049686449e-05, |
|
"loss": 0.1971, |
|
"step": 12680 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 5.58282494958642, |
|
"learning_rate": 1.5697234339261973e-05, |
|
"loss": 0.2066, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 5.049152250234184, |
|
"learning_rate": 1.5682600140059945e-05, |
|
"loss": 0.2155, |
|
"step": 12720 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 5.577269074409907, |
|
"learning_rate": 1.5667947945609098e-05, |
|
"loss": 0.2307, |
|
"step": 12740 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 7.3636602086666905, |
|
"learning_rate": 1.5653277802317107e-05, |
|
"loss": 0.1904, |
|
"step": 12760 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 5.824392969812123, |
|
"learning_rate": 1.5638589756648507e-05, |
|
"loss": 0.1796, |
|
"step": 12780 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 6.363241683808851, |
|
"learning_rate": 1.562388385512452e-05, |
|
"loss": 0.1792, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 8.101137773642606, |
|
"learning_rate": 1.560916014432294e-05, |
|
"loss": 0.1934, |
|
"step": 12820 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 4.945106731069112, |
|
"learning_rate": 1.559441867087796e-05, |
|
"loss": 0.2209, |
|
"step": 12840 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 6.2605136180443495, |
|
"learning_rate": 1.5579659481480026e-05, |
|
"loss": 0.1781, |
|
"step": 12860 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 7.849809107312115, |
|
"learning_rate": 1.5564882622875715e-05, |
|
"loss": 0.1772, |
|
"step": 12880 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 6.076234028129562, |
|
"learning_rate": 1.5550088141867542e-05, |
|
"loss": 0.1798, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 8.417089571258343, |
|
"learning_rate": 1.553527608531386e-05, |
|
"loss": 0.2224, |
|
"step": 12920 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 5.434386315534151, |
|
"learning_rate": 1.5520446500128666e-05, |
|
"loss": 0.1751, |
|
"step": 12940 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 7.365658808789612, |
|
"learning_rate": 1.55055994332815e-05, |
|
"loss": 0.216, |
|
"step": 12960 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 6.124958583146801, |
|
"learning_rate": 1.5490734931797252e-05, |
|
"loss": 0.1785, |
|
"step": 12980 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 6.345434106235919, |
|
"learning_rate": 1.5475853042756045e-05, |
|
"loss": 0.2129, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 10.0344020371502, |
|
"learning_rate": 1.5460953813293065e-05, |
|
"loss": 0.178, |
|
"step": 13020 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 7.7502114051780016, |
|
"learning_rate": 1.544603729059842e-05, |
|
"loss": 0.1777, |
|
"step": 13040 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 6.198622753624231, |
|
"learning_rate": 1.5431103521916996e-05, |
|
"loss": 0.2098, |
|
"step": 13060 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 6.48127602670386, |
|
"learning_rate": 1.5416152554548302e-05, |
|
"loss": 0.164, |
|
"step": 13080 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 7.600239308253696, |
|
"learning_rate": 1.5401184435846316e-05, |
|
"loss": 0.1847, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 6.423081036482468, |
|
"learning_rate": 1.5386199213219344e-05, |
|
"loss": 0.1873, |
|
"step": 13120 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 7.1989659944439355, |
|
"learning_rate": 1.5371196934129854e-05, |
|
"loss": 0.2092, |
|
"step": 13140 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 4.613997491830078, |
|
"learning_rate": 1.5356177646094348e-05, |
|
"loss": 0.1882, |
|
"step": 13160 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 5.629794641682726, |
|
"learning_rate": 1.5341141396683202e-05, |
|
"loss": 0.1952, |
|
"step": 13180 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.86222977330632, |
|
"learning_rate": 1.53260882335205e-05, |
|
"loss": 0.1857, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.390116349700223, |
|
"learning_rate": 1.5311018204283915e-05, |
|
"loss": 0.1862, |
|
"step": 13220 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 4.734598991710353, |
|
"learning_rate": 1.5295931356704522e-05, |
|
"loss": 0.1922, |
|
"step": 13240 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 6.44238392273467, |
|
"learning_rate": 1.5280827738566673e-05, |
|
"loss": 0.1823, |
|
"step": 13260 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 4.314282919486737, |
|
"learning_rate": 1.5265707397707838e-05, |
|
"loss": 0.1904, |
|
"step": 13280 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 6.471785123561109, |
|
"learning_rate": 1.525057038201845e-05, |
|
"loss": 0.2201, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 6.211228619356565, |
|
"learning_rate": 1.523541673944176e-05, |
|
"loss": 0.1941, |
|
"step": 13320 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 5.109706482939786, |
|
"learning_rate": 1.5220246517973674e-05, |
|
"loss": 0.205, |
|
"step": 13340 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 7.1474883569847405, |
|
"learning_rate": 1.5205059765662611e-05, |
|
"loss": 0.1863, |
|
"step": 13360 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 10.992853444090926, |
|
"learning_rate": 1.5189856530609351e-05, |
|
"loss": 0.2029, |
|
"step": 13380 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 5.481913913723081, |
|
"learning_rate": 1.517463686096688e-05, |
|
"loss": 0.2004, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 5.850124859903834, |
|
"learning_rate": 1.5159400804940232e-05, |
|
"loss": 0.2029, |
|
"step": 13420 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 5.113867042039441, |
|
"learning_rate": 1.5144148410786344e-05, |
|
"loss": 0.2166, |
|
"step": 13440 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 6.039714348714059, |
|
"learning_rate": 1.51288797268139e-05, |
|
"loss": 0.2116, |
|
"step": 13460 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 7.541603158363756, |
|
"learning_rate": 1.5113594801383178e-05, |
|
"loss": 0.1925, |
|
"step": 13480 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 7.479663488804942, |
|
"learning_rate": 1.50982936829059e-05, |
|
"loss": 0.1953, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 6.7557856877486175, |
|
"learning_rate": 1.5082976419845078e-05, |
|
"loss": 0.1976, |
|
"step": 13520 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 5.931350253738143, |
|
"learning_rate": 1.5067643060714844e-05, |
|
"loss": 0.2133, |
|
"step": 13540 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 6.494971457661676, |
|
"learning_rate": 1.5052293654080332e-05, |
|
"loss": 0.176, |
|
"step": 13560 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 5.762783270305054, |
|
"learning_rate": 1.503692824855749e-05, |
|
"loss": 0.2096, |
|
"step": 13580 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 6.848342260542276, |
|
"learning_rate": 1.5021546892812934e-05, |
|
"loss": 0.2034, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 5.6448287727059485, |
|
"learning_rate": 1.5006149635563817e-05, |
|
"loss": 0.1936, |
|
"step": 13620 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 5.83680549100651, |
|
"learning_rate": 1.4990736525577642e-05, |
|
"loss": 0.2025, |
|
"step": 13640 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 7.41015135946399, |
|
"learning_rate": 1.4975307611672127e-05, |
|
"loss": 0.2024, |
|
"step": 13660 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 9.778277740797297, |
|
"learning_rate": 1.4959862942715043e-05, |
|
"loss": 0.1707, |
|
"step": 13680 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 5.046412396561728, |
|
"learning_rate": 1.4944402567624065e-05, |
|
"loss": 0.1936, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 11.086870666327583, |
|
"learning_rate": 1.492892653536661e-05, |
|
"loss": 0.1979, |
|
"step": 13720 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 6.0254596329111525, |
|
"learning_rate": 1.4913434894959693e-05, |
|
"loss": 0.1791, |
|
"step": 13740 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 7.033701558289489, |
|
"learning_rate": 1.4897927695469756e-05, |
|
"loss": 0.1905, |
|
"step": 13760 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.474543730422018, |
|
"learning_rate": 1.4882404986012523e-05, |
|
"loss": 0.1693, |
|
"step": 13780 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.1690691824315405, |
|
"learning_rate": 1.4866866815752847e-05, |
|
"loss": 0.1856, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 5.6756955564977964, |
|
"learning_rate": 1.4851313233904547e-05, |
|
"loss": 0.2053, |
|
"step": 13820 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 11.164112387266075, |
|
"learning_rate": 1.4835744289730252e-05, |
|
"loss": 0.171, |
|
"step": 13840 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 7.308139120179797, |
|
"learning_rate": 1.4820160032541254e-05, |
|
"loss": 0.1954, |
|
"step": 13860 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 3.5914657630993294, |
|
"learning_rate": 1.4804560511697341e-05, |
|
"loss": 0.2246, |
|
"step": 13880 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 6.751209857032397, |
|
"learning_rate": 1.4788945776606647e-05, |
|
"loss": 0.2013, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 6.405950176387068, |
|
"learning_rate": 1.477331587672549e-05, |
|
"loss": 0.2113, |
|
"step": 13920 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 6.376328572976509, |
|
"learning_rate": 1.4757670861558228e-05, |
|
"loss": 0.1924, |
|
"step": 13940 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 9.121068656282398, |
|
"learning_rate": 1.4742010780657085e-05, |
|
"loss": 0.209, |
|
"step": 13960 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 4.7626419486771026, |
|
"learning_rate": 1.4726335683622008e-05, |
|
"loss": 0.2255, |
|
"step": 13980 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 6.817788687267193, |
|
"learning_rate": 1.4710645620100499e-05, |
|
"loss": 0.1896, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 7.332059084839946, |
|
"learning_rate": 1.4694940639787466e-05, |
|
"loss": 0.2066, |
|
"step": 14020 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 5.640954681652653, |
|
"learning_rate": 1.4679220792425067e-05, |
|
"loss": 0.1771, |
|
"step": 14040 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 6.095651183293601, |
|
"learning_rate": 1.4663486127802538e-05, |
|
"loss": 0.186, |
|
"step": 14060 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 7.092095404760473, |
|
"learning_rate": 1.464773669575606e-05, |
|
"loss": 0.2142, |
|
"step": 14080 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 4.837369592766958, |
|
"learning_rate": 1.463197254616857e-05, |
|
"loss": 0.2218, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 4.9264497356291646, |
|
"learning_rate": 1.4616193728969633e-05, |
|
"loss": 0.1938, |
|
"step": 14120 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 9.674187157366852, |
|
"learning_rate": 1.4600400294135264e-05, |
|
"loss": 0.2098, |
|
"step": 14140 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 6.437523999215732, |
|
"learning_rate": 1.4584592291687777e-05, |
|
"loss": 0.2029, |
|
"step": 14160 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 6.823030504825212, |
|
"learning_rate": 1.4568769771695625e-05, |
|
"loss": 0.1877, |
|
"step": 14180 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 7.082752727573593, |
|
"learning_rate": 1.4552932784273246e-05, |
|
"loss": 0.1928, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 11.029950376037581, |
|
"learning_rate": 1.45370813795809e-05, |
|
"loss": 0.1682, |
|
"step": 14220 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 4.956260098113924, |
|
"learning_rate": 1.4521215607824499e-05, |
|
"loss": 0.1972, |
|
"step": 14240 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 5.51023882727362, |
|
"learning_rate": 1.4505335519255482e-05, |
|
"loss": 0.1967, |
|
"step": 14260 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 6.160472680324094, |
|
"learning_rate": 1.4489441164170612e-05, |
|
"loss": 0.1913, |
|
"step": 14280 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 9.72267194404117, |
|
"learning_rate": 1.447353259291185e-05, |
|
"loss": 0.1818, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 8.889058328709032, |
|
"learning_rate": 1.4457609855866181e-05, |
|
"loss": 0.2082, |
|
"step": 14320 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.973002280859937, |
|
"learning_rate": 1.4441673003465458e-05, |
|
"loss": 0.1851, |
|
"step": 14340 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 6.797652557445699, |
|
"learning_rate": 1.4425722086186236e-05, |
|
"loss": 0.191, |
|
"step": 14360 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 7.229624826239175, |
|
"learning_rate": 1.4409757154549621e-05, |
|
"loss": 0.1891, |
|
"step": 14380 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 6.741986922621174, |
|
"learning_rate": 1.4393778259121113e-05, |
|
"loss": 0.1868, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 4.781423328723562, |
|
"learning_rate": 1.4377785450510426e-05, |
|
"loss": 0.1953, |
|
"step": 14420 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 5.074661529228375, |
|
"learning_rate": 1.436177877937135e-05, |
|
"loss": 0.2004, |
|
"step": 14440 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 8.033180093952518, |
|
"learning_rate": 1.4345758296401585e-05, |
|
"loss": 0.1816, |
|
"step": 14460 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 5.974169762042515, |
|
"learning_rate": 1.4329724052342569e-05, |
|
"loss": 0.192, |
|
"step": 14480 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 4.656455835490763, |
|
"learning_rate": 1.4313676097979326e-05, |
|
"loss": 0.1835, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 3.1936684555124617, |
|
"learning_rate": 1.4297614484140307e-05, |
|
"loss": 0.1808, |
|
"step": 14520 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 8.20373974631054, |
|
"learning_rate": 1.4281539261697228e-05, |
|
"loss": 0.1836, |
|
"step": 14540 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 6.186778941670351, |
|
"learning_rate": 1.4265450481564904e-05, |
|
"loss": 0.1946, |
|
"step": 14560 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 6.498458092545216, |
|
"learning_rate": 1.4249348194701091e-05, |
|
"loss": 0.1883, |
|
"step": 14580 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 4.671798654398146, |
|
"learning_rate": 1.4233232452106331e-05, |
|
"loss": 0.1981, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 6.939397878773192, |
|
"learning_rate": 1.4217103304823774e-05, |
|
"loss": 0.1858, |
|
"step": 14620 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 9.00976329632259, |
|
"learning_rate": 1.4200960803939034e-05, |
|
"loss": 0.1917, |
|
"step": 14640 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 6.559576346045498, |
|
"learning_rate": 1.4184805000580018e-05, |
|
"loss": 0.1915, |
|
"step": 14660 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 5.306030822468831, |
|
"learning_rate": 1.4168635945916762e-05, |
|
"loss": 0.2023, |
|
"step": 14680 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 6.917396077987428, |
|
"learning_rate": 1.4152453691161279e-05, |
|
"loss": 0.201, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 5.388833984060343, |
|
"learning_rate": 1.4136258287567386e-05, |
|
"loss": 0.1951, |
|
"step": 14720 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 5.741268992759075, |
|
"learning_rate": 1.412004978643055e-05, |
|
"loss": 0.2052, |
|
"step": 14740 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 8.528637831977798, |
|
"learning_rate": 1.4103828239087713e-05, |
|
"loss": 0.1911, |
|
"step": 14760 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 7.229831133673562, |
|
"learning_rate": 1.4087593696917152e-05, |
|
"loss": 0.2147, |
|
"step": 14780 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 8.453977418463722, |
|
"learning_rate": 1.4071346211338287e-05, |
|
"loss": 0.2056, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 4.0493875016839205, |
|
"learning_rate": 1.4055085833811543e-05, |
|
"loss": 0.1875, |
|
"step": 14820 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 6.315957510536494, |
|
"learning_rate": 1.403881261583818e-05, |
|
"loss": 0.2049, |
|
"step": 14840 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 6.174807081413619, |
|
"learning_rate": 1.4022526608960117e-05, |
|
"loss": 0.1887, |
|
"step": 14860 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 5.128483960445226, |
|
"learning_rate": 1.4006227864759787e-05, |
|
"loss": 0.1958, |
|
"step": 14880 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 6.592332443520915, |
|
"learning_rate": 1.3989916434859961e-05, |
|
"loss": 0.174, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 5.47341406161348, |
|
"learning_rate": 1.3973592370923594e-05, |
|
"loss": 0.1972, |
|
"step": 14920 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 10.100193860299168, |
|
"learning_rate": 1.395725572465366e-05, |
|
"loss": 0.217, |
|
"step": 14940 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 5.393687564139285, |
|
"learning_rate": 1.394090654779297e-05, |
|
"loss": 0.1746, |
|
"step": 14960 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 4.862848248796205, |
|
"learning_rate": 1.3924544892124037e-05, |
|
"loss": 0.1804, |
|
"step": 14980 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 5.347965295637652, |
|
"learning_rate": 1.390817080946889e-05, |
|
"loss": 0.1774, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 6.040766822188126, |
|
"learning_rate": 1.3891784351688921e-05, |
|
"loss": 0.2123, |
|
"step": 15020 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 5.8662001265351105, |
|
"learning_rate": 1.3875385570684725e-05, |
|
"loss": 0.1888, |
|
"step": 15040 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 3.4039196477297584, |
|
"learning_rate": 1.3858974518395912e-05, |
|
"loss": 0.1776, |
|
"step": 15060 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 5.345588055655486, |
|
"learning_rate": 1.384255124680097e-05, |
|
"loss": 0.1934, |
|
"step": 15080 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 7.7672522699283455, |
|
"learning_rate": 1.3826115807917088e-05, |
|
"loss": 0.1881, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 14.607170518002183, |
|
"learning_rate": 1.3809668253799989e-05, |
|
"loss": 0.1992, |
|
"step": 15120 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 7.122439543446391, |
|
"learning_rate": 1.379320863654377e-05, |
|
"loss": 0.2071, |
|
"step": 15140 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 7.078892142047386, |
|
"learning_rate": 1.3776737008280734e-05, |
|
"loss": 0.1846, |
|
"step": 15160 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 4.833315926005269, |
|
"learning_rate": 1.3760253421181232e-05, |
|
"loss": 0.1955, |
|
"step": 15180 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 7.642880346774846, |
|
"learning_rate": 1.3743757927453485e-05, |
|
"loss": 0.1926, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 5.4079945125552, |
|
"learning_rate": 1.3727250579343427e-05, |
|
"loss": 0.1873, |
|
"step": 15220 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 5.948430330328783, |
|
"learning_rate": 1.371073142913454e-05, |
|
"loss": 0.207, |
|
"step": 15240 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 5.5464350136347615, |
|
"learning_rate": 1.369420052914769e-05, |
|
"loss": 0.2059, |
|
"step": 15260 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 6.914142224558012, |
|
"learning_rate": 1.3677657931740953e-05, |
|
"loss": 0.2101, |
|
"step": 15280 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 9.57042012056836, |
|
"learning_rate": 1.3661103689309451e-05, |
|
"loss": 0.1845, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 4.631446263817481, |
|
"learning_rate": 1.3644537854285198e-05, |
|
"loss": 0.1676, |
|
"step": 15320 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 7.563813907200117, |
|
"learning_rate": 1.3627960479136917e-05, |
|
"loss": 0.1959, |
|
"step": 15340 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 7.423882592374514, |
|
"learning_rate": 1.3611371616369888e-05, |
|
"loss": 0.2119, |
|
"step": 15360 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 8.033278865814227, |
|
"learning_rate": 1.3594771318525772e-05, |
|
"loss": 0.1999, |
|
"step": 15380 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 4.263461392179193, |
|
"learning_rate": 1.3578159638182443e-05, |
|
"loss": 0.1623, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 5.605534420891448, |
|
"learning_rate": 1.3561536627953846e-05, |
|
"loss": 0.1878, |
|
"step": 15420 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 7.271725167246008, |
|
"learning_rate": 1.3544902340489788e-05, |
|
"loss": 0.203, |
|
"step": 15440 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 24.17397448663477, |
|
"learning_rate": 1.3528256828475806e-05, |
|
"loss": 0.1996, |
|
"step": 15460 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 8.21922387907255, |
|
"learning_rate": 1.3511600144632984e-05, |
|
"loss": 0.2115, |
|
"step": 15480 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 6.438100546073929, |
|
"learning_rate": 1.3494932341717795e-05, |
|
"loss": 0.2178, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 5.438714078216004, |
|
"learning_rate": 1.3478253472521926e-05, |
|
"loss": 0.2035, |
|
"step": 15520 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 5.267123317638661, |
|
"learning_rate": 1.3461563589872115e-05, |
|
"loss": 0.1871, |
|
"step": 15540 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 6.952296244061062, |
|
"learning_rate": 1.3444862746629983e-05, |
|
"loss": 0.1796, |
|
"step": 15560 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 5.566630169564181, |
|
"learning_rate": 1.3428150995691864e-05, |
|
"loss": 0.19, |
|
"step": 15580 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 8.317946865138628, |
|
"learning_rate": 1.3411428389988643e-05, |
|
"loss": 0.1867, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 5.8312505364290015, |
|
"learning_rate": 1.3394694982485588e-05, |
|
"loss": 0.1966, |
|
"step": 15620 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 5.558038014921936, |
|
"learning_rate": 1.3377950826182167e-05, |
|
"loss": 0.2084, |
|
"step": 15640 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 3.3670931844933434, |
|
"learning_rate": 1.3361195974111908e-05, |
|
"loss": 0.1886, |
|
"step": 15660 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 6.730872285494118, |
|
"learning_rate": 1.3344430479342205e-05, |
|
"loss": 0.1991, |
|
"step": 15680 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 5.221936012884134, |
|
"learning_rate": 1.3327654394974164e-05, |
|
"loss": 0.1871, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 8.096789875414892, |
|
"learning_rate": 1.3310867774142433e-05, |
|
"loss": 0.1799, |
|
"step": 15720 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 4.9068121132509335, |
|
"learning_rate": 1.3294070670015026e-05, |
|
"loss": 0.1817, |
|
"step": 15740 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 3.9856004214295386, |
|
"learning_rate": 1.3277263135793167e-05, |
|
"loss": 0.1793, |
|
"step": 15760 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 7.118631561874846, |
|
"learning_rate": 1.3260445224711115e-05, |
|
"loss": 0.1787, |
|
"step": 15780 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 7.914428581966782, |
|
"learning_rate": 1.3243616990035988e-05, |
|
"loss": 0.1821, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 6.602254147384827, |
|
"learning_rate": 1.322677848506761e-05, |
|
"loss": 0.1746, |
|
"step": 15820 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 6.142691150589716, |
|
"learning_rate": 1.3209929763138333e-05, |
|
"loss": 0.1964, |
|
"step": 15840 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 7.318010910855917, |
|
"learning_rate": 1.3193070877612863e-05, |
|
"loss": 0.1974, |
|
"step": 15860 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 6.924430013948181, |
|
"learning_rate": 1.3176201881888104e-05, |
|
"loss": 0.1991, |
|
"step": 15880 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 7.883879242504948, |
|
"learning_rate": 1.3159322829392978e-05, |
|
"loss": 0.1924, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 5.220112148848683, |
|
"learning_rate": 1.3142433773588259e-05, |
|
"loss": 0.2138, |
|
"step": 15920 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 5.620528075363317, |
|
"learning_rate": 1.3125534767966406e-05, |
|
"loss": 0.1833, |
|
"step": 15940 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 7.394718512387162, |
|
"learning_rate": 1.3108625866051393e-05, |
|
"loss": 0.1745, |
|
"step": 15960 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 4.8000424144778275, |
|
"learning_rate": 1.3091707121398535e-05, |
|
"loss": 0.2024, |
|
"step": 15980 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 4.774826621857471, |
|
"learning_rate": 1.3074778587594328e-05, |
|
"loss": 0.2015, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 9.387391712768, |
|
"learning_rate": 1.3057840318256265e-05, |
|
"loss": 0.1795, |
|
"step": 16020 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 4.963107377940831, |
|
"learning_rate": 1.3040892367032682e-05, |
|
"loss": 0.1653, |
|
"step": 16040 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 6.997759324175277, |
|
"learning_rate": 1.3023934787602572e-05, |
|
"loss": 0.2063, |
|
"step": 16060 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 6.190583972900869, |
|
"learning_rate": 1.3006967633675432e-05, |
|
"loss": 0.2153, |
|
"step": 16080 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 6.824679624981862, |
|
"learning_rate": 1.2989990958991077e-05, |
|
"loss": 0.1891, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 8.754013884741005, |
|
"learning_rate": 1.2973004817319479e-05, |
|
"loss": 0.1804, |
|
"step": 16120 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 8.960491352232728, |
|
"learning_rate": 1.29560092624606e-05, |
|
"loss": 0.1923, |
|
"step": 16140 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 4.719281501394497, |
|
"learning_rate": 1.2939004348244207e-05, |
|
"loss": 0.2186, |
|
"step": 16160 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 6.0936889645380825, |
|
"learning_rate": 1.2921990128529713e-05, |
|
"loss": 0.2008, |
|
"step": 16180 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 6.559249218102312, |
|
"learning_rate": 1.2904966657206013e-05, |
|
"loss": 0.1968, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 7.485638410293939, |
|
"learning_rate": 1.2887933988191297e-05, |
|
"loss": 0.1754, |
|
"step": 16220 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 4.949399299007962, |
|
"learning_rate": 1.2870892175432887e-05, |
|
"loss": 0.1949, |
|
"step": 16240 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 5.031890259751463, |
|
"learning_rate": 1.2853841272907068e-05, |
|
"loss": 0.1697, |
|
"step": 16260 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 3.2773740128221367, |
|
"learning_rate": 1.2836781334618912e-05, |
|
"loss": 0.1706, |
|
"step": 16280 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 7.100024972791139, |
|
"learning_rate": 1.2819712414602112e-05, |
|
"loss": 0.1725, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 6.847007498771447, |
|
"learning_rate": 1.2802634566918806e-05, |
|
"loss": 0.2224, |
|
"step": 16320 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 18.124907394698774, |
|
"learning_rate": 1.2785547845659412e-05, |
|
"loss": 0.1954, |
|
"step": 16340 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 6.653915067377773, |
|
"learning_rate": 1.2768452304942449e-05, |
|
"loss": 0.195, |
|
"step": 16360 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 13.653829050105037, |
|
"learning_rate": 1.275134799891438e-05, |
|
"loss": 0.1771, |
|
"step": 16380 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 5.199783489414919, |
|
"learning_rate": 1.2734234981749416e-05, |
|
"loss": 0.1697, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 4.938204944360832, |
|
"learning_rate": 1.2717113307649367e-05, |
|
"loss": 0.2153, |
|
"step": 16420 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 5.379988728337729, |
|
"learning_rate": 1.2699983030843462e-05, |
|
"loss": 0.1807, |
|
"step": 16440 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 5.563035060565681, |
|
"learning_rate": 1.2682844205588175e-05, |
|
"loss": 0.1723, |
|
"step": 16460 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 4.137830028452154, |
|
"learning_rate": 1.2665696886167054e-05, |
|
"loss": 0.2015, |
|
"step": 16480 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 5.444985685023187, |
|
"learning_rate": 1.2648541126890553e-05, |
|
"loss": 0.1891, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 6.758562379366711, |
|
"learning_rate": 1.2631376982095857e-05, |
|
"loss": 0.1794, |
|
"step": 16520 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 6.42547461986865, |
|
"learning_rate": 1.2614204506146714e-05, |
|
"loss": 0.2072, |
|
"step": 16540 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 6.67478604205973, |
|
"learning_rate": 1.2597023753433248e-05, |
|
"loss": 0.1752, |
|
"step": 16560 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 5.114865214342277, |
|
"learning_rate": 1.2579834778371814e-05, |
|
"loss": 0.2129, |
|
"step": 16580 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 6.721543126268775, |
|
"learning_rate": 1.2562637635404791e-05, |
|
"loss": 0.1774, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 5.93001108203901, |
|
"learning_rate": 1.2545432379000448e-05, |
|
"loss": 0.1773, |
|
"step": 16620 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 5.7523308051939095, |
|
"learning_rate": 1.2528219063652729e-05, |
|
"loss": 0.2078, |
|
"step": 16640 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 6.871500685256494, |
|
"learning_rate": 1.2510997743881129e-05, |
|
"loss": 0.1804, |
|
"step": 16660 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 6.975118558142685, |
|
"learning_rate": 1.249376847423047e-05, |
|
"loss": 0.1923, |
|
"step": 16680 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 5.518168870477368, |
|
"learning_rate": 1.2476531309270773e-05, |
|
"loss": 0.2043, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 7.024733851268127, |
|
"learning_rate": 1.2459286303597055e-05, |
|
"loss": 0.1957, |
|
"step": 16720 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 5.189450368424653, |
|
"learning_rate": 1.244203351182917e-05, |
|
"loss": 0.1972, |
|
"step": 16740 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 4.769858359227226, |
|
"learning_rate": 1.2424772988611631e-05, |
|
"loss": 0.2045, |
|
"step": 16760 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 5.6345946816574575, |
|
"learning_rate": 1.2407504788613441e-05, |
|
"loss": 0.184, |
|
"step": 16780 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 8.03446545781803, |
|
"learning_rate": 1.2390228966527917e-05, |
|
"loss": 0.2016, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 5.376750572260963, |
|
"learning_rate": 1.2372945577072516e-05, |
|
"loss": 0.221, |
|
"step": 16820 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 3.9735936922072836, |
|
"learning_rate": 1.2355654674988669e-05, |
|
"loss": 0.2193, |
|
"step": 16840 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 4.966981585117317, |
|
"learning_rate": 1.2338356315041587e-05, |
|
"loss": 0.1788, |
|
"step": 16860 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 5.389331607177196, |
|
"learning_rate": 1.232105055202012e-05, |
|
"loss": 0.2325, |
|
"step": 16880 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 3.4531020772181753, |
|
"learning_rate": 1.2303737440736553e-05, |
|
"loss": 0.1978, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 8.524005426914416, |
|
"learning_rate": 1.2286417036026454e-05, |
|
"loss": 0.2219, |
|
"step": 16920 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 8.61532713797543, |
|
"learning_rate": 1.2269089392748484e-05, |
|
"loss": 0.1786, |
|
"step": 16940 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 7.510098080661287, |
|
"learning_rate": 1.225175456578423e-05, |
|
"loss": 0.192, |
|
"step": 16960 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 5.7571280705405075, |
|
"learning_rate": 1.2234412610038045e-05, |
|
"loss": 0.1884, |
|
"step": 16980 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 6.431467433327115, |
|
"learning_rate": 1.2217063580436841e-05, |
|
"loss": 0.1861, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 5.997538061494255, |
|
"learning_rate": 1.219970753192995e-05, |
|
"loss": 0.196, |
|
"step": 17020 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 5.041053272270793, |
|
"learning_rate": 1.218234451948893e-05, |
|
"loss": 0.1676, |
|
"step": 17040 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 8.26975958810288, |
|
"learning_rate": 1.2164974598107398e-05, |
|
"loss": 0.1953, |
|
"step": 17060 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 4.861930347014316, |
|
"learning_rate": 1.2147597822800843e-05, |
|
"loss": 0.2077, |
|
"step": 17080 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 6.4205005966339534, |
|
"learning_rate": 1.2130214248606478e-05, |
|
"loss": 0.1743, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 6.274607928060438, |
|
"learning_rate": 1.2112823930583042e-05, |
|
"loss": 0.168, |
|
"step": 17120 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 8.376715185548848, |
|
"learning_rate": 1.2095426923810631e-05, |
|
"loss": 0.1821, |
|
"step": 17140 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 8.347057130026942, |
|
"learning_rate": 1.2078023283390532e-05, |
|
"loss": 0.201, |
|
"step": 17160 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 6.10042452976507, |
|
"learning_rate": 1.2060613064445041e-05, |
|
"loss": 0.1864, |
|
"step": 17180 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 6.882586091178972, |
|
"learning_rate": 1.204319632211729e-05, |
|
"loss": 0.1945, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 4.831518270883375, |
|
"learning_rate": 1.2025773111571067e-05, |
|
"loss": 0.1997, |
|
"step": 17220 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 5.820557290888362, |
|
"learning_rate": 1.2008343487990652e-05, |
|
"loss": 0.213, |
|
"step": 17240 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 7.590766477620916, |
|
"learning_rate": 1.199090750658064e-05, |
|
"loss": 0.1943, |
|
"step": 17260 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 5.245303096541225, |
|
"learning_rate": 1.1973465222565756e-05, |
|
"loss": 0.1935, |
|
"step": 17280 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 5.635466946173672, |
|
"learning_rate": 1.1956016691190693e-05, |
|
"loss": 0.1937, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 9.8612439882403, |
|
"learning_rate": 1.1938561967719929e-05, |
|
"loss": 0.1998, |
|
"step": 17320 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 7.57010186594224, |
|
"learning_rate": 1.1921101107437547e-05, |
|
"loss": 0.1859, |
|
"step": 17340 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 5.578111322637294, |
|
"learning_rate": 1.190363416564708e-05, |
|
"loss": 0.1885, |
|
"step": 17360 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 10.636687907364392, |
|
"learning_rate": 1.188616119767132e-05, |
|
"loss": 0.2183, |
|
"step": 17380 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 5.149639168848235, |
|
"learning_rate": 1.1868682258852135e-05, |
|
"loss": 0.1854, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 7.37696372575026, |
|
"learning_rate": 1.1851197404550314e-05, |
|
"loss": 0.1859, |
|
"step": 17420 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 7.336027199681234, |
|
"learning_rate": 1.183370669014538e-05, |
|
"loss": 0.1685, |
|
"step": 17440 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 5.449592911101573, |
|
"learning_rate": 1.181621017103542e-05, |
|
"loss": 0.2028, |
|
"step": 17460 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 8.354625001227333, |
|
"learning_rate": 1.1798707902636895e-05, |
|
"loss": 0.1841, |
|
"step": 17480 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 6.278768324372431, |
|
"learning_rate": 1.178119994038449e-05, |
|
"loss": 0.1682, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 6.121827218539604, |
|
"learning_rate": 1.1763686339730911e-05, |
|
"loss": 0.1864, |
|
"step": 17520 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 4.642245950398283, |
|
"learning_rate": 1.174616715614673e-05, |
|
"loss": 0.1919, |
|
"step": 17540 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 12.026938401540649, |
|
"learning_rate": 1.1728642445120205e-05, |
|
"loss": 0.1876, |
|
"step": 17560 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 5.259501261110696, |
|
"learning_rate": 1.1711112262157093e-05, |
|
"loss": 0.196, |
|
"step": 17580 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 6.612093751827706, |
|
"learning_rate": 1.1693576662780486e-05, |
|
"loss": 0.1811, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 6.169870867251168, |
|
"learning_rate": 1.167603570253063e-05, |
|
"loss": 0.1955, |
|
"step": 17620 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 7.205754195375382, |
|
"learning_rate": 1.1658489436964753e-05, |
|
"loss": 0.1806, |
|
"step": 17640 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 7.67399813297936, |
|
"learning_rate": 1.1640937921656882e-05, |
|
"loss": 0.198, |
|
"step": 17660 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 5.102867345151343, |
|
"learning_rate": 1.1623381212197677e-05, |
|
"loss": 0.1663, |
|
"step": 17680 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 6.182588419638686, |
|
"learning_rate": 1.1605819364194244e-05, |
|
"loss": 0.1972, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 8.729593950859055, |
|
"learning_rate": 1.1588252433269966e-05, |
|
"loss": 0.1978, |
|
"step": 17720 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 4.896187735642024, |
|
"learning_rate": 1.1570680475064328e-05, |
|
"loss": 0.181, |
|
"step": 17740 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 8.586607472154492, |
|
"learning_rate": 1.1553103545232738e-05, |
|
"loss": 0.1778, |
|
"step": 17760 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 7.077719973666291, |
|
"learning_rate": 1.1535521699446344e-05, |
|
"loss": 0.1881, |
|
"step": 17780 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 6.368844445064559, |
|
"learning_rate": 1.151793499339187e-05, |
|
"loss": 0.1837, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 5.603581999541538, |
|
"learning_rate": 1.1500343482771433e-05, |
|
"loss": 0.1788, |
|
"step": 17820 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 6.124117106428645, |
|
"learning_rate": 1.1482747223302362e-05, |
|
"loss": 0.2073, |
|
"step": 17840 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 4.896094240852357, |
|
"learning_rate": 1.146514627071704e-05, |
|
"loss": 0.2018, |
|
"step": 17860 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 8.633063928041105, |
|
"learning_rate": 1.1447540680762697e-05, |
|
"loss": 0.187, |
|
"step": 17880 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 5.8650395795782595, |
|
"learning_rate": 1.1429930509201264e-05, |
|
"loss": 0.1884, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.3374318833847245, |
|
"learning_rate": 1.141231581180918e-05, |
|
"loss": 0.1755, |
|
"step": 17920 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 9.55305523159018, |
|
"learning_rate": 1.1394696644377216e-05, |
|
"loss": 0.183, |
|
"step": 17940 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 5.489288302847405, |
|
"learning_rate": 1.1377073062710309e-05, |
|
"loss": 0.1963, |
|
"step": 17960 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 5.8983191471983805, |
|
"learning_rate": 1.1359445122627362e-05, |
|
"loss": 0.1895, |
|
"step": 17980 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 7.189465859300051, |
|
"learning_rate": 1.1341812879961095e-05, |
|
"loss": 0.1673, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 7.447490489655281, |
|
"learning_rate": 1.1324176390557853e-05, |
|
"loss": 0.1809, |
|
"step": 18020 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 5.775175955597824, |
|
"learning_rate": 1.1306535710277428e-05, |
|
"loss": 0.1791, |
|
"step": 18040 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 8.2814761724716, |
|
"learning_rate": 1.1288890894992888e-05, |
|
"loss": 0.1819, |
|
"step": 18060 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 4.995617280837431, |
|
"learning_rate": 1.12712420005904e-05, |
|
"loss": 0.1925, |
|
"step": 18080 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 5.884038263005907, |
|
"learning_rate": 1.1253589082969046e-05, |
|
"loss": 0.1854, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 4.022777792168135, |
|
"learning_rate": 1.1235932198040653e-05, |
|
"loss": 0.1728, |
|
"step": 18120 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 4.792975515098119, |
|
"learning_rate": 1.1218271401729617e-05, |
|
"loss": 0.1836, |
|
"step": 18140 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 5.250400164657746, |
|
"learning_rate": 1.1200606749972718e-05, |
|
"loss": 0.167, |
|
"step": 18160 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 7.457426673152692, |
|
"learning_rate": 1.1182938298718945e-05, |
|
"loss": 0.1829, |
|
"step": 18180 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 14.206800331502505, |
|
"learning_rate": 1.1165266103929328e-05, |
|
"loss": 0.1778, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 6.502359695534185, |
|
"learning_rate": 1.1147590221576754e-05, |
|
"loss": 0.1799, |
|
"step": 18220 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 7.1040996052400525, |
|
"learning_rate": 1.1129910707645779e-05, |
|
"loss": 0.1917, |
|
"step": 18240 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 6.809597692047531, |
|
"learning_rate": 1.1112227618132472e-05, |
|
"loss": 0.1584, |
|
"step": 18260 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 6.214106220478112, |
|
"learning_rate": 1.1094541009044219e-05, |
|
"loss": 0.1745, |
|
"step": 18280 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 6.0818007467761, |
|
"learning_rate": 1.1076850936399564e-05, |
|
"loss": 0.1811, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 7.541164852188345, |
|
"learning_rate": 1.1059157456228008e-05, |
|
"loss": 0.1642, |
|
"step": 18320 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 8.065093340490257, |
|
"learning_rate": 1.104146062456986e-05, |
|
"loss": 0.1801, |
|
"step": 18340 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 6.675920675840688, |
|
"learning_rate": 1.1023760497476028e-05, |
|
"loss": 0.1756, |
|
"step": 18360 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 6.6987902445864655, |
|
"learning_rate": 1.1006057131007866e-05, |
|
"loss": 0.1795, |
|
"step": 18380 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 6.510299738533309, |
|
"learning_rate": 1.0988350581236991e-05, |
|
"loss": 0.1865, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 6.34510225061825, |
|
"learning_rate": 1.0970640904245094e-05, |
|
"loss": 0.1955, |
|
"step": 18420 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 4.228947726190241, |
|
"learning_rate": 1.0952928156123781e-05, |
|
"loss": 0.175, |
|
"step": 18440 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 6.238306015624761, |
|
"learning_rate": 1.0935212392974372e-05, |
|
"loss": 0.1767, |
|
"step": 18460 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 5.8825690027835424, |
|
"learning_rate": 1.0917493670907751e-05, |
|
"loss": 0.1804, |
|
"step": 18480 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 5.307188542805111, |
|
"learning_rate": 1.0899772046044157e-05, |
|
"loss": 0.2165, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 7.671752182857302, |
|
"learning_rate": 1.0882047574513045e-05, |
|
"loss": 0.1754, |
|
"step": 18520 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 7.228224840476562, |
|
"learning_rate": 1.0864320312452865e-05, |
|
"loss": 0.1749, |
|
"step": 18540 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 7.022023025670514, |
|
"learning_rate": 1.0846590316010918e-05, |
|
"loss": 0.1815, |
|
"step": 18560 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 11.59241800805326, |
|
"learning_rate": 1.082885764134316e-05, |
|
"loss": 0.1947, |
|
"step": 18580 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 5.496283576035116, |
|
"learning_rate": 1.081112234461403e-05, |
|
"loss": 0.191, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 8.37203471510935, |
|
"learning_rate": 1.0793384481996279e-05, |
|
"loss": 0.2098, |
|
"step": 18620 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.85467675057146, |
|
"learning_rate": 1.0775644109670778e-05, |
|
"loss": 0.199, |
|
"step": 18640 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 7.2927775672082165, |
|
"learning_rate": 1.0757901283826341e-05, |
|
"loss": 0.1763, |
|
"step": 18660 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 4.587898523966167, |
|
"learning_rate": 1.0740156060659565e-05, |
|
"loss": 0.1933, |
|
"step": 18680 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 4.846531420101418, |
|
"learning_rate": 1.0722408496374634e-05, |
|
"loss": 0.1605, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 6.357471377975472, |
|
"learning_rate": 1.0704658647183155e-05, |
|
"loss": 0.1911, |
|
"step": 18720 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 4.884941528032838, |
|
"learning_rate": 1.0686906569303955e-05, |
|
"loss": 0.1613, |
|
"step": 18740 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 6.578672698862572, |
|
"learning_rate": 1.0669152318962936e-05, |
|
"loss": 0.1583, |
|
"step": 18760 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 4.996568837666117, |
|
"learning_rate": 1.0651395952392876e-05, |
|
"loss": 0.1893, |
|
"step": 18780 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 6.137635879410934, |
|
"learning_rate": 1.0633637525833246e-05, |
|
"loss": 0.2006, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 10.339616116567784, |
|
"learning_rate": 1.0615877095530058e-05, |
|
"loss": 0.2034, |
|
"step": 18820 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 5.500147904228012, |
|
"learning_rate": 1.0598114717735661e-05, |
|
"loss": 0.1989, |
|
"step": 18840 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 5.340910932588536, |
|
"learning_rate": 1.0580350448708571e-05, |
|
"loss": 0.2044, |
|
"step": 18860 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 7.0020980415617275, |
|
"learning_rate": 1.0562584344713301e-05, |
|
"loss": 0.1873, |
|
"step": 18880 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 6.566893932198695, |
|
"learning_rate": 1.0544816462020169e-05, |
|
"loss": 0.1672, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 21.295579564416062, |
|
"learning_rate": 1.052704685690513e-05, |
|
"loss": 0.1728, |
|
"step": 18920 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 5.869527987425055, |
|
"learning_rate": 1.0509275585649594e-05, |
|
"loss": 0.2102, |
|
"step": 18940 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 15.699095798380197, |
|
"learning_rate": 1.0491502704540249e-05, |
|
"loss": 0.1861, |
|
"step": 18960 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 4.514889983973343, |
|
"learning_rate": 1.0473728269868879e-05, |
|
"loss": 0.189, |
|
"step": 18980 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 6.288009726033927, |
|
"learning_rate": 1.045595233793219e-05, |
|
"loss": 0.1626, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 5.491080420793532, |
|
"learning_rate": 1.0438174965031632e-05, |
|
"loss": 0.1763, |
|
"step": 19020 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 5.751789643793079, |
|
"learning_rate": 1.0420396207473214e-05, |
|
"loss": 0.1938, |
|
"step": 19040 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 6.8267007909359, |
|
"learning_rate": 1.0402616121567339e-05, |
|
"loss": 0.1965, |
|
"step": 19060 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 9.0429514544921, |
|
"learning_rate": 1.0384834763628609e-05, |
|
"loss": 0.1956, |
|
"step": 19080 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 4.883758892659512, |
|
"learning_rate": 1.0367052189975661e-05, |
|
"loss": 0.2052, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 4.958192369182968, |
|
"learning_rate": 1.0349268456930978e-05, |
|
"loss": 0.1595, |
|
"step": 19120 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 7.1048523990512775, |
|
"learning_rate": 1.0331483620820718e-05, |
|
"loss": 0.1802, |
|
"step": 19140 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 7.81188718299639, |
|
"learning_rate": 1.0313697737974532e-05, |
|
"loss": 0.1762, |
|
"step": 19160 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 7.41902623164456, |
|
"learning_rate": 1.0295910864725385e-05, |
|
"loss": 0.1815, |
|
"step": 19180 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 6.487508949464715, |
|
"learning_rate": 1.027812305740938e-05, |
|
"loss": 0.1868, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 6.2665205241809, |
|
"learning_rate": 1.0260334372365579e-05, |
|
"loss": 0.1786, |
|
"step": 19220 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 10.235739114720019, |
|
"learning_rate": 1.0242544865935822e-05, |
|
"loss": 0.1974, |
|
"step": 19240 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 7.9210272040028205, |
|
"learning_rate": 1.0224754594464548e-05, |
|
"loss": 0.1995, |
|
"step": 19260 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 7.599636480772786, |
|
"learning_rate": 1.020696361429863e-05, |
|
"loss": 0.1838, |
|
"step": 19280 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 6.6901787357374305, |
|
"learning_rate": 1.0189171981787176e-05, |
|
"loss": 0.1857, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 7.859622048956387, |
|
"learning_rate": 1.0171379753281365e-05, |
|
"loss": 0.1473, |
|
"step": 19320 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 9.023527968972886, |
|
"learning_rate": 1.015358698513426e-05, |
|
"loss": 0.2087, |
|
"step": 19340 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 7.332951564677237, |
|
"learning_rate": 1.0135793733700635e-05, |
|
"loss": 0.1723, |
|
"step": 19360 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 6.6064475500042406, |
|
"learning_rate": 1.0118000055336792e-05, |
|
"loss": 0.1892, |
|
"step": 19380 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 7.156751057104011, |
|
"learning_rate": 1.0100206006400388e-05, |
|
"loss": 0.1808, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 7.071734037861301, |
|
"learning_rate": 1.0082411643250256e-05, |
|
"loss": 0.1987, |
|
"step": 19420 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 5.576705794819434, |
|
"learning_rate": 1.0064617022246218e-05, |
|
"loss": 0.1826, |
|
"step": 19440 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 9.630333945880572, |
|
"learning_rate": 1.0046822199748918e-05, |
|
"loss": 0.1778, |
|
"step": 19460 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 10.401189633333978, |
|
"learning_rate": 1.0029027232119637e-05, |
|
"loss": 0.1834, |
|
"step": 19480 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 5.393374772794798, |
|
"learning_rate": 1.0011232175720113e-05, |
|
"loss": 0.1738, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 6.924075773515395, |
|
"learning_rate": 9.993437086912373e-06, |
|
"loss": 0.1917, |
|
"step": 19520 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 8.996239194302989, |
|
"learning_rate": 9.975642022058535e-06, |
|
"loss": 0.164, |
|
"step": 19540 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 6.440697625799741, |
|
"learning_rate": 9.95784703752065e-06, |
|
"loss": 0.1846, |
|
"step": 19560 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 4.738285879209919, |
|
"learning_rate": 9.940052189660508e-06, |
|
"loss": 0.2179, |
|
"step": 19580 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 6.078747244317922, |
|
"learning_rate": 9.922257534839473e-06, |
|
"loss": 0.1678, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 4.399307517265612, |
|
"learning_rate": 9.904463129418295e-06, |
|
"loss": 0.188, |
|
"step": 19620 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 6.3147663542594135, |
|
"learning_rate": 9.886669029756928e-06, |
|
"loss": 0.1814, |
|
"step": 19640 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 10.295074351756918, |
|
"learning_rate": 9.86887529221437e-06, |
|
"loss": 0.1594, |
|
"step": 19660 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 6.127518428057287, |
|
"learning_rate": 9.851081973148461e-06, |
|
"loss": 0.1583, |
|
"step": 19680 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 7.524745554123201, |
|
"learning_rate": 9.833289128915719e-06, |
|
"loss": 0.1725, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 8.809630447586713, |
|
"learning_rate": 9.815496815871163e-06, |
|
"loss": 0.1835, |
|
"step": 19720 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 5.0997599152545705, |
|
"learning_rate": 9.79770509036812e-06, |
|
"loss": 0.1918, |
|
"step": 19740 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 5.7257380913445415, |
|
"learning_rate": 9.779914008758064e-06, |
|
"loss": 0.179, |
|
"step": 19760 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 6.438675856373519, |
|
"learning_rate": 9.762123627390428e-06, |
|
"loss": 0.2072, |
|
"step": 19780 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 7.284638627304715, |
|
"learning_rate": 9.744334002612426e-06, |
|
"loss": 0.1655, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 7.624618520730721, |
|
"learning_rate": 9.726545190768871e-06, |
|
"loss": 0.1907, |
|
"step": 19820 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 5.640863218058795, |
|
"learning_rate": 9.70875724820201e-06, |
|
"loss": 0.1743, |
|
"step": 19840 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 6.508589424568692, |
|
"learning_rate": 9.690970231251332e-06, |
|
"loss": 0.1778, |
|
"step": 19860 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 4.719808699723677, |
|
"learning_rate": 9.673184196253397e-06, |
|
"loss": 0.1842, |
|
"step": 19880 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 5.327888908637108, |
|
"learning_rate": 9.655399199541648e-06, |
|
"loss": 0.1778, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 4.62310607898033, |
|
"learning_rate": 9.63761529744625e-06, |
|
"loss": 0.159, |
|
"step": 19920 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 4.242984591464857, |
|
"learning_rate": 9.61983254629389e-06, |
|
"loss": 0.1766, |
|
"step": 19940 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 4.634594013494804, |
|
"learning_rate": 9.60205100240762e-06, |
|
"loss": 0.186, |
|
"step": 19960 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 6.440683707201506, |
|
"learning_rate": 9.584270722106662e-06, |
|
"loss": 0.1856, |
|
"step": 19980 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.790764895496361, |
|
"learning_rate": 9.566491761706234e-06, |
|
"loss": 0.1841, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 7.195201164651666, |
|
"learning_rate": 9.54871417751738e-06, |
|
"loss": 0.1723, |
|
"step": 20020 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 4.764709985159199, |
|
"learning_rate": 9.530938025846778e-06, |
|
"loss": 0.1866, |
|
"step": 20040 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 5.932858550259948, |
|
"learning_rate": 9.513163362996577e-06, |
|
"loss": 0.1866, |
|
"step": 20060 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 6.464021142906132, |
|
"learning_rate": 9.495390245264204e-06, |
|
"loss": 0.1868, |
|
"step": 20080 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 5.655476722759092, |
|
"learning_rate": 9.477618728942194e-06, |
|
"loss": 0.166, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 5.962336904666332, |
|
"learning_rate": 9.459848870318007e-06, |
|
"loss": 0.2101, |
|
"step": 20120 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 4.675684775725181, |
|
"learning_rate": 9.44208072567386e-06, |
|
"loss": 0.1772, |
|
"step": 20140 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 7.450883124319922, |
|
"learning_rate": 9.42431435128654e-06, |
|
"loss": 0.1647, |
|
"step": 20160 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 4.965583223967794, |
|
"learning_rate": 9.406549803427218e-06, |
|
"loss": 0.2103, |
|
"step": 20180 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 6.079517195254289, |
|
"learning_rate": 9.388787138361289e-06, |
|
"loss": 0.1917, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 5.5400797739723515, |
|
"learning_rate": 9.371026412348178e-06, |
|
"loss": 0.1691, |
|
"step": 20220 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 5.974428361817412, |
|
"learning_rate": 9.353267681641178e-06, |
|
"loss": 0.1887, |
|
"step": 20240 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 5.87274126501588, |
|
"learning_rate": 9.335511002487252e-06, |
|
"loss": 0.1888, |
|
"step": 20260 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 5.03986866734462, |
|
"learning_rate": 9.31775643112687e-06, |
|
"loss": 0.1793, |
|
"step": 20280 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 4.961609321820686, |
|
"learning_rate": 9.300004023793826e-06, |
|
"loss": 0.1811, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 6.09235980486571, |
|
"learning_rate": 9.282253836715063e-06, |
|
"loss": 0.1699, |
|
"step": 20320 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 6.186755202964333, |
|
"learning_rate": 9.264505926110482e-06, |
|
"loss": 0.1936, |
|
"step": 20340 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 6.795058856229219, |
|
"learning_rate": 9.246760348192785e-06, |
|
"loss": 0.1988, |
|
"step": 20360 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 8.626010116914388, |
|
"learning_rate": 9.229017159167278e-06, |
|
"loss": 0.1753, |
|
"step": 20380 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 7.036939700979724, |
|
"learning_rate": 9.211276415231704e-06, |
|
"loss": 0.1775, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 7.222151578247956, |
|
"learning_rate": 9.193538172576061e-06, |
|
"loss": 0.2063, |
|
"step": 20420 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 6.261687984855021, |
|
"learning_rate": 9.175802487382427e-06, |
|
"loss": 0.1875, |
|
"step": 20440 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 6.763660595031022, |
|
"learning_rate": 9.158069415824776e-06, |
|
"loss": 0.162, |
|
"step": 20460 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 6.645406711322244, |
|
"learning_rate": 9.140339014068805e-06, |
|
"loss": 0.1701, |
|
"step": 20480 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 6.075259280852945, |
|
"learning_rate": 9.122611338271759e-06, |
|
"loss": 0.1876, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 5.071793679309942, |
|
"learning_rate": 9.104886444582239e-06, |
|
"loss": 0.1891, |
|
"step": 20520 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 6.0993248112456, |
|
"learning_rate": 9.087164389140048e-06, |
|
"loss": 0.1773, |
|
"step": 20540 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 3.137955931101823, |
|
"learning_rate": 9.069445228075984e-06, |
|
"loss": 0.175, |
|
"step": 20560 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 6.87017680822122, |
|
"learning_rate": 9.051729017511696e-06, |
|
"loss": 0.1781, |
|
"step": 20580 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 6.740478016007273, |
|
"learning_rate": 9.034015813559472e-06, |
|
"loss": 0.1842, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 6.293414885644483, |
|
"learning_rate": 9.016305672322082e-06, |
|
"loss": 0.1754, |
|
"step": 20620 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 6.770107414261222, |
|
"learning_rate": 8.998598649892602e-06, |
|
"loss": 0.1832, |
|
"step": 20640 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 6.957667275109727, |
|
"learning_rate": 8.98089480235422e-06, |
|
"loss": 0.1915, |
|
"step": 20660 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 7.753279699575756, |
|
"learning_rate": 8.963194185780076e-06, |
|
"loss": 0.2074, |
|
"step": 20680 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 5.572695528675712, |
|
"learning_rate": 8.94549685623307e-06, |
|
"loss": 0.1675, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 8.318273389362469, |
|
"learning_rate": 8.927802869765697e-06, |
|
"loss": 0.1901, |
|
"step": 20720 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 6.1756703742616645, |
|
"learning_rate": 8.91011228241986e-06, |
|
"loss": 0.1759, |
|
"step": 20740 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 4.72678645840794, |
|
"learning_rate": 8.892425150226697e-06, |
|
"loss": 0.1672, |
|
"step": 20760 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 5.064164253114183, |
|
"learning_rate": 8.874741529206401e-06, |
|
"loss": 0.1832, |
|
"step": 20780 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 5.593557173319701, |
|
"learning_rate": 8.857061475368046e-06, |
|
"loss": 0.1767, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 5.739827985678297, |
|
"learning_rate": 8.83938504470941e-06, |
|
"loss": 0.1633, |
|
"step": 20820 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.8745235027093172, |
|
"learning_rate": 8.821712293216792e-06, |
|
"loss": 0.1827, |
|
"step": 20840 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 5.758169590139612, |
|
"learning_rate": 8.804043276864838e-06, |
|
"loss": 0.1799, |
|
"step": 20860 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 9.048806835785985, |
|
"learning_rate": 8.786378051616363e-06, |
|
"loss": 0.1818, |
|
"step": 20880 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 4.6191342246916545, |
|
"learning_rate": 8.768716673422176e-06, |
|
"loss": 0.184, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 10.64178899596623, |
|
"learning_rate": 8.751059198220903e-06, |
|
"loss": 0.1868, |
|
"step": 20920 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 5.957903048399736, |
|
"learning_rate": 8.733405681938806e-06, |
|
"loss": 0.2088, |
|
"step": 20940 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 7.14845190828466, |
|
"learning_rate": 8.715756180489609e-06, |
|
"loss": 0.1591, |
|
"step": 20960 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 5.026492884909507, |
|
"learning_rate": 8.698110749774315e-06, |
|
"loss": 0.1692, |
|
"step": 20980 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 6.193357527199346, |
|
"learning_rate": 8.680469445681042e-06, |
|
"loss": 0.1865, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 5.392241393190001, |
|
"learning_rate": 8.662832324084831e-06, |
|
"loss": 0.1643, |
|
"step": 21020 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 7.209071685667427, |
|
"learning_rate": 8.645199440847485e-06, |
|
"loss": 0.1699, |
|
"step": 21040 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 6.378055513443605, |
|
"learning_rate": 8.62757085181737e-06, |
|
"loss": 0.1997, |
|
"step": 21060 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 5.948983100737581, |
|
"learning_rate": 8.609946612829258e-06, |
|
"loss": 0.1768, |
|
"step": 21080 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 5.843305438262167, |
|
"learning_rate": 8.592326779704148e-06, |
|
"loss": 0.1819, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 4.695464831529554, |
|
"learning_rate": 8.574711408249074e-06, |
|
"loss": 0.1984, |
|
"step": 21120 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 5.334180148766731, |
|
"learning_rate": 8.557100554256944e-06, |
|
"loss": 0.18, |
|
"step": 21140 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 3.594041232042909, |
|
"learning_rate": 8.53949427350636e-06, |
|
"loss": 0.165, |
|
"step": 21160 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 5.690661212066448, |
|
"learning_rate": 8.521892621761433e-06, |
|
"loss": 0.2051, |
|
"step": 21180 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 8.263658257123618, |
|
"learning_rate": 8.504295654771622e-06, |
|
"loss": 0.178, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 6.964730099763294, |
|
"learning_rate": 8.486703428271536e-06, |
|
"loss": 0.1718, |
|
"step": 21220 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 7.2364322382017745, |
|
"learning_rate": 8.469115997980786e-06, |
|
"loss": 0.1609, |
|
"step": 21240 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 7.01112598686971, |
|
"learning_rate": 8.451533419603773e-06, |
|
"loss": 0.1918, |
|
"step": 21260 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 6.7742526609944385, |
|
"learning_rate": 8.433955748829543e-06, |
|
"loss": 0.1746, |
|
"step": 21280 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 6.080225878083447, |
|
"learning_rate": 8.416383041331594e-06, |
|
"loss": 0.1621, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 6.122578979331691, |
|
"learning_rate": 8.398815352767706e-06, |
|
"loss": 0.1866, |
|
"step": 21320 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 9.715953363199073, |
|
"learning_rate": 8.38125273877976e-06, |
|
"loss": 0.1696, |
|
"step": 21340 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 6.632818538811297, |
|
"learning_rate": 8.363695254993569e-06, |
|
"loss": 0.182, |
|
"step": 21360 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 5.18153129890793, |
|
"learning_rate": 8.346142957018688e-06, |
|
"loss": 0.2091, |
|
"step": 21380 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 4.933940908460862, |
|
"learning_rate": 8.32859590044826e-06, |
|
"loss": 0.1834, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 4.130527376581761, |
|
"learning_rate": 8.311054140858814e-06, |
|
"loss": 0.217, |
|
"step": 21420 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 4.51550356163752, |
|
"learning_rate": 8.29351773381011e-06, |
|
"loss": 0.2001, |
|
"step": 21440 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 5.869039230613348, |
|
"learning_rate": 8.275986734844956e-06, |
|
"loss": 0.176, |
|
"step": 21460 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 5.039303039014279, |
|
"learning_rate": 8.258461199489026e-06, |
|
"loss": 0.2202, |
|
"step": 21480 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 6.5768380616493936, |
|
"learning_rate": 8.240941183250689e-06, |
|
"loss": 0.1748, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 5.013498586372746, |
|
"learning_rate": 8.22342674162084e-06, |
|
"loss": 0.1933, |
|
"step": 21520 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 4.181027548764225, |
|
"learning_rate": 8.205917930072707e-06, |
|
"loss": 0.1706, |
|
"step": 21540 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 6.782829698366385, |
|
"learning_rate": 8.188414804061698e-06, |
|
"loss": 0.1857, |
|
"step": 21560 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 13.591366754088444, |
|
"learning_rate": 8.170917419025203e-06, |
|
"loss": 0.1467, |
|
"step": 21580 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 6.120949476303091, |
|
"learning_rate": 8.153425830382438e-06, |
|
"loss": 0.1991, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 7.272557124401674, |
|
"learning_rate": 8.135940093534249e-06, |
|
"loss": 0.1766, |
|
"step": 21620 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 6.349382878150412, |
|
"learning_rate": 8.11846026386296e-06, |
|
"loss": 0.1989, |
|
"step": 21640 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.9867833656488356, |
|
"learning_rate": 8.100986396732173e-06, |
|
"loss": 0.1831, |
|
"step": 21660 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 5.4985809229416365, |
|
"learning_rate": 8.083518547486617e-06, |
|
"loss": 0.1851, |
|
"step": 21680 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 5.519318033191571, |
|
"learning_rate": 8.066056771451954e-06, |
|
"loss": 0.1879, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.14973956651229, |
|
"learning_rate": 8.048601123934609e-06, |
|
"loss": 0.1737, |
|
"step": 21720 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 4.004131091247943, |
|
"learning_rate": 8.031151660221597e-06, |
|
"loss": 0.1667, |
|
"step": 21740 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 6.567536955091622, |
|
"learning_rate": 8.013708435580352e-06, |
|
"loss": 0.1697, |
|
"step": 21760 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 4.599731495525866, |
|
"learning_rate": 7.996271505258542e-06, |
|
"loss": 0.1547, |
|
"step": 21780 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 6.65599496558806, |
|
"learning_rate": 7.978840924483904e-06, |
|
"loss": 0.1774, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 8.560171882828884, |
|
"learning_rate": 7.961416748464055e-06, |
|
"loss": 0.2049, |
|
"step": 21820 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 6.954024696005634, |
|
"learning_rate": 7.943999032386336e-06, |
|
"loss": 0.1881, |
|
"step": 21840 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 4.630461689811874, |
|
"learning_rate": 7.926587831417623e-06, |
|
"loss": 0.1881, |
|
"step": 21860 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 7.232546878139292, |
|
"learning_rate": 7.90918320070416e-06, |
|
"loss": 0.1995, |
|
"step": 21880 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 7.453903957263037, |
|
"learning_rate": 7.891785195371375e-06, |
|
"loss": 0.1722, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 7.759776194383061, |
|
"learning_rate": 7.874393870523715e-06, |
|
"loss": 0.1695, |
|
"step": 21920 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 10.868257032537139, |
|
"learning_rate": 7.857009281244472e-06, |
|
"loss": 0.1835, |
|
"step": 21940 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 5.779984289233758, |
|
"learning_rate": 7.839631482595597e-06, |
|
"loss": 0.1665, |
|
"step": 21960 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 5.114836127824816, |
|
"learning_rate": 7.822260529617539e-06, |
|
"loss": 0.1882, |
|
"step": 21980 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 5.8070528491627105, |
|
"learning_rate": 7.804896477329062e-06, |
|
"loss": 0.2043, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 3.7936436595812935, |
|
"learning_rate": 7.787539380727074e-06, |
|
"loss": 0.1828, |
|
"step": 22020 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 7.004553267660414, |
|
"learning_rate": 7.770189294786455e-06, |
|
"loss": 0.1891, |
|
"step": 22040 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 7.107618580250647, |
|
"learning_rate": 7.752846274459873e-06, |
|
"loss": 0.1952, |
|
"step": 22060 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 7.293012687334171, |
|
"learning_rate": 7.735510374677624e-06, |
|
"loss": 0.1668, |
|
"step": 22080 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 5.229665339832979, |
|
"learning_rate": 7.718181650347453e-06, |
|
"loss": 0.2154, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 6.524237121183421, |
|
"learning_rate": 7.70086015635437e-06, |
|
"loss": 0.1834, |
|
"step": 22120 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 7.363125050150268, |
|
"learning_rate": 7.683545947560491e-06, |
|
"loss": 0.1865, |
|
"step": 22140 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 7.300432730612564, |
|
"learning_rate": 7.666239078804853e-06, |
|
"loss": 0.1818, |
|
"step": 22160 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 14.035100259942592, |
|
"learning_rate": 7.648939604903252e-06, |
|
"loss": 0.191, |
|
"step": 22180 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 6.910011337425548, |
|
"learning_rate": 7.631647580648057e-06, |
|
"loss": 0.168, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 6.7568942995248324, |
|
"learning_rate": 7.6143630608080395e-06, |
|
"loss": 0.1843, |
|
"step": 22220 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 6.747067132204526, |
|
"learning_rate": 7.597086100128209e-06, |
|
"loss": 0.1937, |
|
"step": 22240 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 8.5527154190958, |
|
"learning_rate": 7.579816753329629e-06, |
|
"loss": 0.1818, |
|
"step": 22260 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 7.746379621946916, |
|
"learning_rate": 7.562555075109248e-06, |
|
"loss": 0.2052, |
|
"step": 22280 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 6.639988193271665, |
|
"learning_rate": 7.545301120139724e-06, |
|
"loss": 0.1631, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 6.767819936156134, |
|
"learning_rate": 7.528054943069261e-06, |
|
"loss": 0.1661, |
|
"step": 22320 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 5.273647818433291, |
|
"learning_rate": 7.510816598521416e-06, |
|
"loss": 0.1584, |
|
"step": 22340 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 8.93563631273546, |
|
"learning_rate": 7.493586141094952e-06, |
|
"loss": 0.1555, |
|
"step": 22360 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 3.0753266447920566, |
|
"learning_rate": 7.47636362536364e-06, |
|
"loss": 0.1517, |
|
"step": 22380 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 10.30705792722804, |
|
"learning_rate": 7.459149105876106e-06, |
|
"loss": 0.154, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 7.984708476707701, |
|
"learning_rate": 7.441942637155638e-06, |
|
"loss": 0.1671, |
|
"step": 22420 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 5.866654657582794, |
|
"learning_rate": 7.424744273700038e-06, |
|
"loss": 0.1886, |
|
"step": 22440 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 4.737602317329208, |
|
"learning_rate": 7.407554069981428e-06, |
|
"loss": 0.2059, |
|
"step": 22460 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 4.877816193781035, |
|
"learning_rate": 7.390372080446089e-06, |
|
"loss": 0.198, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 4.465809741780349, |
|
"learning_rate": 7.373198359514283e-06, |
|
"loss": 0.1678, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 6.934781753417284, |
|
"learning_rate": 7.356032961580083e-06, |
|
"loss": 0.18, |
|
"step": 22520 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 6.355881689873095, |
|
"learning_rate": 7.338875941011206e-06, |
|
"loss": 0.1676, |
|
"step": 22540 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 6.110876810996104, |
|
"learning_rate": 7.321727352148833e-06, |
|
"loss": 0.1855, |
|
"step": 22560 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 5.355037257245673, |
|
"learning_rate": 7.304587249307434e-06, |
|
"loss": 0.1804, |
|
"step": 22580 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 6.6548543550416195, |
|
"learning_rate": 7.287455686774608e-06, |
|
"loss": 0.2034, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 9.27569310973663, |
|
"learning_rate": 7.270332718810901e-06, |
|
"loss": 0.1937, |
|
"step": 22620 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 4.751357838150305, |
|
"learning_rate": 7.253218399649638e-06, |
|
"loss": 0.1651, |
|
"step": 22640 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 9.734100288935727, |
|
"learning_rate": 7.2361127834967505e-06, |
|
"loss": 0.1529, |
|
"step": 22660 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 8.120082834507159, |
|
"learning_rate": 7.219015924530608e-06, |
|
"loss": 0.1747, |
|
"step": 22680 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 5.407215715714017, |
|
"learning_rate": 7.201927876901839e-06, |
|
"loss": 0.1704, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 5.774551387388109, |
|
"learning_rate": 7.184848694733164e-06, |
|
"loss": 0.161, |
|
"step": 22720 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 6.759114928231145, |
|
"learning_rate": 7.167778432119233e-06, |
|
"loss": 0.1879, |
|
"step": 22740 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 4.92199338740308, |
|
"learning_rate": 7.150717143126433e-06, |
|
"loss": 0.1652, |
|
"step": 22760 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 4.178823107172061, |
|
"learning_rate": 7.133664881792739e-06, |
|
"loss": 0.1785, |
|
"step": 22780 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 6.686026437573051, |
|
"learning_rate": 7.116621702127524e-06, |
|
"loss": 0.1869, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 4.2403070205517, |
|
"learning_rate": 7.099587658111403e-06, |
|
"loss": 0.1673, |
|
"step": 22820 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 3.965564681031053, |
|
"learning_rate": 7.082562803696054e-06, |
|
"loss": 0.1606, |
|
"step": 22840 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 7.1312004014078205, |
|
"learning_rate": 7.065547192804044e-06, |
|
"loss": 0.1833, |
|
"step": 22860 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 6.361412078237849, |
|
"learning_rate": 7.048540879328677e-06, |
|
"loss": 0.176, |
|
"step": 22880 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 7.594302059247436, |
|
"learning_rate": 7.031543917133794e-06, |
|
"loss": 0.1622, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 10.096625612296556, |
|
"learning_rate": 7.014556360053627e-06, |
|
"loss": 0.1875, |
|
"step": 22920 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 5.429860679005328, |
|
"learning_rate": 6.997578261892612e-06, |
|
"loss": 0.1742, |
|
"step": 22940 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 4.64963548113418, |
|
"learning_rate": 6.980609676425238e-06, |
|
"loss": 0.1645, |
|
"step": 22960 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 7.328249932303926, |
|
"learning_rate": 6.963650657395851e-06, |
|
"loss": 0.1653, |
|
"step": 22980 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 4.797310178817107, |
|
"learning_rate": 6.946701258518505e-06, |
|
"loss": 0.1718, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 5.93397915500725, |
|
"learning_rate": 6.929761533476782e-06, |
|
"loss": 0.171, |
|
"step": 23020 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 4.493879394366119, |
|
"learning_rate": 6.912831535923627e-06, |
|
"loss": 0.1596, |
|
"step": 23040 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 4.5526438818226564, |
|
"learning_rate": 6.89591131948117e-06, |
|
"loss": 0.1477, |
|
"step": 23060 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 5.57770560261374, |
|
"learning_rate": 6.879000937740566e-06, |
|
"loss": 0.1911, |
|
"step": 23080 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 5.825009574391105, |
|
"learning_rate": 6.862100444261819e-06, |
|
"loss": 0.1768, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 5.136029725948361, |
|
"learning_rate": 6.845209892573611e-06, |
|
"loss": 0.1863, |
|
"step": 23120 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 7.989879882826286, |
|
"learning_rate": 6.828329336173145e-06, |
|
"loss": 0.1763, |
|
"step": 23140 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 6.65685803139121, |
|
"learning_rate": 6.8114588285259576e-06, |
|
"loss": 0.1755, |
|
"step": 23160 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 9.26693783187225, |
|
"learning_rate": 6.794598423065758e-06, |
|
"loss": 0.176, |
|
"step": 23180 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 3.574898543618821, |
|
"learning_rate": 6.7777481731942616e-06, |
|
"loss": 0.1858, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 4.831086963108529, |
|
"learning_rate": 6.760908132281021e-06, |
|
"loss": 0.1796, |
|
"step": 23220 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 7.68773272402437, |
|
"learning_rate": 6.744078353663247e-06, |
|
"loss": 0.1703, |
|
"step": 23240 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 9.370545816880146, |
|
"learning_rate": 6.727258890645652e-06, |
|
"loss": 0.18, |
|
"step": 23260 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 9.159176720015646, |
|
"learning_rate": 6.710449796500274e-06, |
|
"loss": 0.1716, |
|
"step": 23280 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 5.362130304855524, |
|
"learning_rate": 6.693651124466311e-06, |
|
"loss": 0.168, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 4.870253771176025, |
|
"learning_rate": 6.676862927749953e-06, |
|
"loss": 0.2008, |
|
"step": 23320 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 4.029937293840335, |
|
"learning_rate": 6.6600852595242075e-06, |
|
"loss": 0.1735, |
|
"step": 23340 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 5.6204264379241025, |
|
"learning_rate": 6.643318172928737e-06, |
|
"loss": 0.1707, |
|
"step": 23360 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 9.703306236355932, |
|
"learning_rate": 6.626561721069688e-06, |
|
"loss": 0.1599, |
|
"step": 23380 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 7.347162760458088, |
|
"learning_rate": 6.609815957019527e-06, |
|
"loss": 0.1703, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 12.113550817300446, |
|
"learning_rate": 6.593080933816866e-06, |
|
"loss": 0.1784, |
|
"step": 23420 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.3783362111733486, |
|
"learning_rate": 6.576356704466297e-06, |
|
"loss": 0.1641, |
|
"step": 23440 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 4.068637613130167, |
|
"learning_rate": 6.5596433219382285e-06, |
|
"loss": 0.1436, |
|
"step": 23460 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 6.015225800979343, |
|
"learning_rate": 6.542940839168712e-06, |
|
"loss": 0.1975, |
|
"step": 23480 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 7.2342521376918585, |
|
"learning_rate": 6.5262493090592715e-06, |
|
"loss": 0.1882, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 7.619431760731591, |
|
"learning_rate": 6.509568784476753e-06, |
|
"loss": 0.1743, |
|
"step": 23520 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 4.213072170259656, |
|
"learning_rate": 6.4928993182531345e-06, |
|
"loss": 0.1576, |
|
"step": 23540 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 8.226101564425411, |
|
"learning_rate": 6.476240963185369e-06, |
|
"loss": 0.1565, |
|
"step": 23560 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 8.224634189065512, |
|
"learning_rate": 6.459593772035225e-06, |
|
"loss": 0.1835, |
|
"step": 23580 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 6.247185035023087, |
|
"learning_rate": 6.442957797529104e-06, |
|
"loss": 0.1736, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 5.9810785123502965, |
|
"learning_rate": 6.426333092357886e-06, |
|
"loss": 0.1615, |
|
"step": 23620 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 11.84481798656128, |
|
"learning_rate": 6.409719709176755e-06, |
|
"loss": 0.1888, |
|
"step": 23640 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 7.506850102533956, |
|
"learning_rate": 6.393117700605034e-06, |
|
"loss": 0.1963, |
|
"step": 23660 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 7.776460084859693, |
|
"learning_rate": 6.376527119226023e-06, |
|
"loss": 0.1485, |
|
"step": 23680 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 6.3180091914164125, |
|
"learning_rate": 6.359948017586827e-06, |
|
"loss": 0.1816, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 11.20267927275362, |
|
"learning_rate": 6.343380448198188e-06, |
|
"loss": 0.1652, |
|
"step": 23720 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 6.037469216973155, |
|
"learning_rate": 6.326824463534336e-06, |
|
"loss": 0.1725, |
|
"step": 23740 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 6.261846682645075, |
|
"learning_rate": 6.310280116032791e-06, |
|
"loss": 0.1538, |
|
"step": 23760 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 8.59123942376336, |
|
"learning_rate": 6.293747458094223e-06, |
|
"loss": 0.1737, |
|
"step": 23780 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 10.500292092756426, |
|
"learning_rate": 6.277226542082278e-06, |
|
"loss": 0.1921, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 4.477032435385099, |
|
"learning_rate": 6.260717420323409e-06, |
|
"loss": 0.1721, |
|
"step": 23820 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 5.503172566318245, |
|
"learning_rate": 6.244220145106716e-06, |
|
"loss": 0.1668, |
|
"step": 23840 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 13.937620974986471, |
|
"learning_rate": 6.227734768683779e-06, |
|
"loss": 0.1721, |
|
"step": 23860 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 4.6367027070331135, |
|
"learning_rate": 6.211261343268485e-06, |
|
"loss": 0.1765, |
|
"step": 23880 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 5.804661212928365, |
|
"learning_rate": 6.194799921036879e-06, |
|
"loss": 0.1706, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 5.838975230670599, |
|
"learning_rate": 6.178350554126979e-06, |
|
"loss": 0.1684, |
|
"step": 23920 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 6.638228830097108, |
|
"learning_rate": 6.161913294638621e-06, |
|
"loss": 0.1848, |
|
"step": 23940 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 5.656968213422785, |
|
"learning_rate": 6.1454881946333e-06, |
|
"loss": 0.1674, |
|
"step": 23960 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 10.46109722035461, |
|
"learning_rate": 6.1290753061339925e-06, |
|
"loss": 0.1631, |
|
"step": 23980 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 7.0280095071831425, |
|
"learning_rate": 6.112674681124998e-06, |
|
"loss": 0.1759, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 7.401073079016803, |
|
"learning_rate": 6.09628637155178e-06, |
|
"loss": 0.185, |
|
"step": 24020 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 7.45583155210073, |
|
"learning_rate": 6.079910429320789e-06, |
|
"loss": 0.1907, |
|
"step": 24040 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 5.361896066514213, |
|
"learning_rate": 6.063546906299304e-06, |
|
"loss": 0.1661, |
|
"step": 24060 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 4.390297656906118, |
|
"learning_rate": 6.047195854315274e-06, |
|
"loss": 0.161, |
|
"step": 24080 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 7.3569152997891365, |
|
"learning_rate": 6.030857325157148e-06, |
|
"loss": 0.183, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 9.276295993466274, |
|
"learning_rate": 6.014531370573706e-06, |
|
"loss": 0.1585, |
|
"step": 24120 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 6.708581228789846, |
|
"learning_rate": 5.99821804227391e-06, |
|
"loss": 0.1923, |
|
"step": 24140 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 4.565214003093718, |
|
"learning_rate": 5.981917391926716e-06, |
|
"loss": 0.1618, |
|
"step": 24160 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 4.958953220289099, |
|
"learning_rate": 5.9656294711609455e-06, |
|
"loss": 0.1766, |
|
"step": 24180 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 8.765162382650892, |
|
"learning_rate": 5.949354331565087e-06, |
|
"loss": 0.179, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 9.338864277112508, |
|
"learning_rate": 5.93309202468715e-06, |
|
"loss": 0.1772, |
|
"step": 24220 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 6.455990799227259, |
|
"learning_rate": 5.916842602034503e-06, |
|
"loss": 0.1764, |
|
"step": 24240 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 5.02536557514161, |
|
"learning_rate": 5.900606115073703e-06, |
|
"loss": 0.1834, |
|
"step": 24260 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 5.6042058818064335, |
|
"learning_rate": 5.884382615230334e-06, |
|
"loss": 0.1667, |
|
"step": 24280 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 3.5189123085762195, |
|
"learning_rate": 5.8681721538888544e-06, |
|
"loss": 0.1572, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 5.6992692847099855, |
|
"learning_rate": 5.85197478239242e-06, |
|
"loss": 0.1953, |
|
"step": 24320 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 7.984601221033869, |
|
"learning_rate": 5.835790552042726e-06, |
|
"loss": 0.1821, |
|
"step": 24340 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 5.326599797739027, |
|
"learning_rate": 5.819619514099847e-06, |
|
"loss": 0.1899, |
|
"step": 24360 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 5.785676158799588, |
|
"learning_rate": 5.80346171978208e-06, |
|
"loss": 0.1655, |
|
"step": 24380 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 5.66486255125483, |
|
"learning_rate": 5.78731722026576e-06, |
|
"loss": 0.1787, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 3.0399399302219625, |
|
"learning_rate": 5.771186066685136e-06, |
|
"loss": 0.1913, |
|
"step": 24420 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 4.538207717223615, |
|
"learning_rate": 5.755068310132162e-06, |
|
"loss": 0.1486, |
|
"step": 24440 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 8.673712678315818, |
|
"learning_rate": 5.738964001656382e-06, |
|
"loss": 0.1561, |
|
"step": 24460 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 3.9322192187509035, |
|
"learning_rate": 5.722873192264731e-06, |
|
"loss": 0.1594, |
|
"step": 24480 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 5.362419487053112, |
|
"learning_rate": 5.706795932921395e-06, |
|
"loss": 0.1769, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 5.243766171745307, |
|
"learning_rate": 5.690732274547639e-06, |
|
"loss": 0.1674, |
|
"step": 24520 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 6.49132016995219, |
|
"learning_rate": 5.674682268021655e-06, |
|
"loss": 0.1795, |
|
"step": 24540 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 8.23819975362725, |
|
"learning_rate": 5.658645964178398e-06, |
|
"loss": 0.1739, |
|
"step": 24560 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 6.291038363918079, |
|
"learning_rate": 5.642623413809408e-06, |
|
"loss": 0.1574, |
|
"step": 24580 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 5.366637178107015, |
|
"learning_rate": 5.626614667662681e-06, |
|
"loss": 0.1694, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 9.263601664358115, |
|
"learning_rate": 5.610619776442482e-06, |
|
"loss": 0.1928, |
|
"step": 24620 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 6.177003530809166, |
|
"learning_rate": 5.5946387908091995e-06, |
|
"loss": 0.1578, |
|
"step": 24640 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 4.122671452664669, |
|
"learning_rate": 5.5786717613791675e-06, |
|
"loss": 0.1652, |
|
"step": 24660 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 8.598868326647608, |
|
"learning_rate": 5.562718738724532e-06, |
|
"loss": 0.1829, |
|
"step": 24680 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 5.201437186659346, |
|
"learning_rate": 5.54677977337306e-06, |
|
"loss": 0.1948, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 5.546180531185139, |
|
"learning_rate": 5.530854915808009e-06, |
|
"loss": 0.1632, |
|
"step": 24720 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 5.599419815798738, |
|
"learning_rate": 5.514944216467942e-06, |
|
"loss": 0.173, |
|
"step": 24740 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 6.685290390475231, |
|
"learning_rate": 5.4990477257465854e-06, |
|
"loss": 0.1767, |
|
"step": 24760 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 8.819487702938599, |
|
"learning_rate": 5.483165493992667e-06, |
|
"loss": 0.1491, |
|
"step": 24780 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.9679892704495603, |
|
"learning_rate": 5.467297571509735e-06, |
|
"loss": 0.1422, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 6.793517658152576, |
|
"learning_rate": 5.451444008556042e-06, |
|
"loss": 0.1183, |
|
"step": 24820 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 5.961884869528241, |
|
"learning_rate": 5.435604855344332e-06, |
|
"loss": 0.1284, |
|
"step": 24840 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 7.746334808953981, |
|
"learning_rate": 5.419780162041731e-06, |
|
"loss": 0.1081, |
|
"step": 24860 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 4.4474985013962485, |
|
"learning_rate": 5.4039699787695536e-06, |
|
"loss": 0.1347, |
|
"step": 24880 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 5.856732802707876, |
|
"learning_rate": 5.388174355603166e-06, |
|
"loss": 0.1545, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 4.924020644664483, |
|
"learning_rate": 5.372393342571808e-06, |
|
"loss": 0.1499, |
|
"step": 24920 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 5.682977240510656, |
|
"learning_rate": 5.356626989658453e-06, |
|
"loss": 0.1246, |
|
"step": 24940 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 5.859264972925292, |
|
"learning_rate": 5.340875346799646e-06, |
|
"loss": 0.1305, |
|
"step": 24960 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 4.623581539899941, |
|
"learning_rate": 5.325138463885324e-06, |
|
"loss": 0.1264, |
|
"step": 24980 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 7.01967036937822, |
|
"learning_rate": 5.309416390758695e-06, |
|
"loss": 0.1069, |
|
"step": 25000 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 37164, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 0.0, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|