bgem3-reranker-e2 / trainer_state.json
nntoan209's picture
Upload folder using huggingface_hub
321e19a verified
raw
history blame contribute delete
No virus
200 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0180412891247754,
"eval_steps": 500,
"global_step": 25000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 7.78010667114874,
"learning_rate": 4.0236686390532546e-07,
"loss": 0.5346,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 6.216957639101055,
"learning_rate": 6.153846153846155e-07,
"loss": 0.5043,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 7.990357548396736,
"learning_rate": 8.284023668639055e-07,
"loss": 0.5017,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 8.12688107356609,
"learning_rate": 1.0414201183431955e-06,
"loss": 0.4952,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 6.530843475685683,
"learning_rate": 1.2544378698224851e-06,
"loss": 0.5621,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 6.308467672405027,
"learning_rate": 1.4674556213017752e-06,
"loss": 0.4549,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 7.243052870190241,
"learning_rate": 1.6804733727810652e-06,
"loss": 0.4466,
"step": 140
},
{
"epoch": 0.01,
"grad_norm": 10.219726515841495,
"learning_rate": 1.8934911242603552e-06,
"loss": 0.3893,
"step": 160
},
{
"epoch": 0.01,
"grad_norm": 6.627649317339657,
"learning_rate": 2.106508875739645e-06,
"loss": 0.4179,
"step": 180
},
{
"epoch": 0.02,
"grad_norm": 6.44244224679364,
"learning_rate": 2.319526627218935e-06,
"loss": 0.4226,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 6.612950721768246,
"learning_rate": 2.532544378698225e-06,
"loss": 0.3795,
"step": 220
},
{
"epoch": 0.02,
"grad_norm": 6.283517912051673,
"learning_rate": 2.7455621301775153e-06,
"loss": 0.4276,
"step": 240
},
{
"epoch": 0.02,
"grad_norm": 7.268987062349035,
"learning_rate": 2.958579881656805e-06,
"loss": 0.3619,
"step": 260
},
{
"epoch": 0.02,
"grad_norm": 8.340583800596072,
"learning_rate": 3.171597633136095e-06,
"loss": 0.4244,
"step": 280
},
{
"epoch": 0.02,
"grad_norm": 6.75600646477272,
"learning_rate": 3.384615384615385e-06,
"loss": 0.3852,
"step": 300
},
{
"epoch": 0.03,
"grad_norm": 5.647054711391784,
"learning_rate": 3.597633136094675e-06,
"loss": 0.3809,
"step": 320
},
{
"epoch": 0.03,
"grad_norm": 7.253045067435066,
"learning_rate": 3.8106508875739652e-06,
"loss": 0.3858,
"step": 340
},
{
"epoch": 0.03,
"grad_norm": 7.301184351749545,
"learning_rate": 4.023668639053255e-06,
"loss": 0.3549,
"step": 360
},
{
"epoch": 0.03,
"grad_norm": 7.59195138486003,
"learning_rate": 4.236686390532545e-06,
"loss": 0.4048,
"step": 380
},
{
"epoch": 0.03,
"grad_norm": 10.124611929532549,
"learning_rate": 4.449704142011835e-06,
"loss": 0.3646,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 7.744526068197853,
"learning_rate": 4.662721893491124e-06,
"loss": 0.3677,
"step": 420
},
{
"epoch": 0.04,
"grad_norm": 8.246611853098463,
"learning_rate": 4.875739644970415e-06,
"loss": 0.3573,
"step": 440
},
{
"epoch": 0.04,
"grad_norm": 7.675393597081337,
"learning_rate": 5.088757396449705e-06,
"loss": 0.3714,
"step": 460
},
{
"epoch": 0.04,
"grad_norm": 7.407374558199348,
"learning_rate": 5.301775147928995e-06,
"loss": 0.3762,
"step": 480
},
{
"epoch": 0.04,
"grad_norm": 7.5336943019480875,
"learning_rate": 5.514792899408284e-06,
"loss": 0.3423,
"step": 500
},
{
"epoch": 0.04,
"grad_norm": 6.789944607897793,
"learning_rate": 5.727810650887574e-06,
"loss": 0.3382,
"step": 520
},
{
"epoch": 0.04,
"grad_norm": 6.2048712513712765,
"learning_rate": 5.940828402366864e-06,
"loss": 0.355,
"step": 540
},
{
"epoch": 0.05,
"grad_norm": 7.7676635768181255,
"learning_rate": 6.153846153846153e-06,
"loss": 0.3481,
"step": 560
},
{
"epoch": 0.05,
"grad_norm": 5.33865756273462,
"learning_rate": 6.366863905325444e-06,
"loss": 0.3486,
"step": 580
},
{
"epoch": 0.05,
"grad_norm": 7.215416340807466,
"learning_rate": 6.579881656804735e-06,
"loss": 0.3378,
"step": 600
},
{
"epoch": 0.05,
"grad_norm": 6.563753135562715,
"learning_rate": 6.792899408284025e-06,
"loss": 0.3126,
"step": 620
},
{
"epoch": 0.05,
"grad_norm": 8.45042187241426,
"learning_rate": 7.005917159763315e-06,
"loss": 0.3231,
"step": 640
},
{
"epoch": 0.05,
"grad_norm": 6.734275587016377,
"learning_rate": 7.218934911242604e-06,
"loss": 0.3533,
"step": 660
},
{
"epoch": 0.05,
"grad_norm": 6.417837093997314,
"learning_rate": 7.431952662721894e-06,
"loss": 0.3335,
"step": 680
},
{
"epoch": 0.06,
"grad_norm": 8.615382928114506,
"learning_rate": 7.644970414201183e-06,
"loss": 0.3495,
"step": 700
},
{
"epoch": 0.06,
"grad_norm": 6.6395133490495395,
"learning_rate": 7.857988165680473e-06,
"loss": 0.3371,
"step": 720
},
{
"epoch": 0.06,
"grad_norm": 9.710125890008053,
"learning_rate": 8.071005917159764e-06,
"loss": 0.3619,
"step": 740
},
{
"epoch": 0.06,
"grad_norm": 8.21056721557438,
"learning_rate": 8.284023668639054e-06,
"loss": 0.3479,
"step": 760
},
{
"epoch": 0.06,
"grad_norm": 7.0850591314913185,
"learning_rate": 8.497041420118344e-06,
"loss": 0.3217,
"step": 780
},
{
"epoch": 0.06,
"grad_norm": 8.709374421841343,
"learning_rate": 8.710059171597634e-06,
"loss": 0.345,
"step": 800
},
{
"epoch": 0.07,
"grad_norm": 8.703998514988717,
"learning_rate": 8.923076923076925e-06,
"loss": 0.3544,
"step": 820
},
{
"epoch": 0.07,
"grad_norm": 8.233699532008588,
"learning_rate": 9.136094674556215e-06,
"loss": 0.3449,
"step": 840
},
{
"epoch": 0.07,
"grad_norm": 6.360611479052244,
"learning_rate": 9.349112426035503e-06,
"loss": 0.3709,
"step": 860
},
{
"epoch": 0.07,
"grad_norm": 5.21422760141616,
"learning_rate": 9.562130177514794e-06,
"loss": 0.3031,
"step": 880
},
{
"epoch": 0.07,
"grad_norm": 10.285280127672143,
"learning_rate": 9.775147928994084e-06,
"loss": 0.314,
"step": 900
},
{
"epoch": 0.07,
"grad_norm": 8.561551897783339,
"learning_rate": 9.988165680473372e-06,
"loss": 0.34,
"step": 920
},
{
"epoch": 0.08,
"grad_norm": 8.337797520489195,
"learning_rate": 1.0201183431952664e-05,
"loss": 0.3324,
"step": 940
},
{
"epoch": 0.08,
"grad_norm": 9.531967986532953,
"learning_rate": 1.0414201183431953e-05,
"loss": 0.359,
"step": 960
},
{
"epoch": 0.08,
"grad_norm": 6.913190435381454,
"learning_rate": 1.0627218934911243e-05,
"loss": 0.3715,
"step": 980
},
{
"epoch": 0.08,
"grad_norm": 8.525404719704843,
"learning_rate": 1.0840236686390533e-05,
"loss": 0.2926,
"step": 1000
},
{
"epoch": 0.08,
"grad_norm": 8.186212103501571,
"learning_rate": 1.1053254437869825e-05,
"loss": 0.351,
"step": 1020
},
{
"epoch": 0.08,
"grad_norm": 5.968895396114415,
"learning_rate": 1.1266272189349114e-05,
"loss": 0.3325,
"step": 1040
},
{
"epoch": 0.09,
"grad_norm": 9.069989076248202,
"learning_rate": 1.1479289940828404e-05,
"loss": 0.3075,
"step": 1060
},
{
"epoch": 0.09,
"grad_norm": 6.617321242745247,
"learning_rate": 1.1692307692307694e-05,
"loss": 0.3321,
"step": 1080
},
{
"epoch": 0.09,
"grad_norm": 8.096724387107496,
"learning_rate": 1.1905325443786983e-05,
"loss": 0.3508,
"step": 1100
},
{
"epoch": 0.09,
"grad_norm": 7.972357677530315,
"learning_rate": 1.2118343195266273e-05,
"loss": 0.3031,
"step": 1120
},
{
"epoch": 0.09,
"grad_norm": 7.82798416711515,
"learning_rate": 1.2331360946745563e-05,
"loss": 0.2912,
"step": 1140
},
{
"epoch": 0.09,
"grad_norm": 10.732472169183794,
"learning_rate": 1.2544378698224854e-05,
"loss": 0.2956,
"step": 1160
},
{
"epoch": 0.1,
"grad_norm": 7.4415223600685625,
"learning_rate": 1.2757396449704142e-05,
"loss": 0.3234,
"step": 1180
},
{
"epoch": 0.1,
"grad_norm": 7.006979596197168,
"learning_rate": 1.2970414201183432e-05,
"loss": 0.3271,
"step": 1200
},
{
"epoch": 0.1,
"grad_norm": 8.00254256852378,
"learning_rate": 1.3183431952662723e-05,
"loss": 0.3165,
"step": 1220
},
{
"epoch": 0.1,
"grad_norm": 14.742499561979336,
"learning_rate": 1.3396449704142011e-05,
"loss": 0.3015,
"step": 1240
},
{
"epoch": 0.1,
"grad_norm": 7.14191343340385,
"learning_rate": 1.3609467455621301e-05,
"loss": 0.3273,
"step": 1260
},
{
"epoch": 0.1,
"grad_norm": 5.720523746760943,
"learning_rate": 1.3822485207100593e-05,
"loss": 0.3073,
"step": 1280
},
{
"epoch": 0.1,
"grad_norm": 29.467573544915435,
"learning_rate": 1.4035502958579883e-05,
"loss": 0.2779,
"step": 1300
},
{
"epoch": 0.11,
"grad_norm": 6.058319538140977,
"learning_rate": 1.4248520710059172e-05,
"loss": 0.3176,
"step": 1320
},
{
"epoch": 0.11,
"grad_norm": 5.979724640901607,
"learning_rate": 1.4461538461538462e-05,
"loss": 0.2919,
"step": 1340
},
{
"epoch": 0.11,
"grad_norm": 9.296561099611436,
"learning_rate": 1.4674556213017754e-05,
"loss": 0.3073,
"step": 1360
},
{
"epoch": 0.11,
"grad_norm": 6.766863350601436,
"learning_rate": 1.4887573964497044e-05,
"loss": 0.3449,
"step": 1380
},
{
"epoch": 0.11,
"grad_norm": 9.0738349271763,
"learning_rate": 1.5100591715976333e-05,
"loss": 0.3282,
"step": 1400
},
{
"epoch": 0.11,
"grad_norm": 5.238889472577541,
"learning_rate": 1.5313609467455623e-05,
"loss": 0.2766,
"step": 1420
},
{
"epoch": 0.12,
"grad_norm": 6.068739678701155,
"learning_rate": 1.5526627218934912e-05,
"loss": 0.3527,
"step": 1440
},
{
"epoch": 0.12,
"grad_norm": 8.761831948666881,
"learning_rate": 1.5739644970414204e-05,
"loss": 0.309,
"step": 1460
},
{
"epoch": 0.12,
"grad_norm": 5.696276254834093,
"learning_rate": 1.5952662721893492e-05,
"loss": 0.3334,
"step": 1480
},
{
"epoch": 0.12,
"grad_norm": 5.698639074786848,
"learning_rate": 1.616568047337278e-05,
"loss": 0.306,
"step": 1500
},
{
"epoch": 0.12,
"grad_norm": 6.244148261549131,
"learning_rate": 1.6378698224852073e-05,
"loss": 0.3,
"step": 1520
},
{
"epoch": 0.12,
"grad_norm": 6.948838050368144,
"learning_rate": 1.659171597633136e-05,
"loss": 0.3348,
"step": 1540
},
{
"epoch": 0.13,
"grad_norm": 6.672363835099913,
"learning_rate": 1.6804733727810653e-05,
"loss": 0.3534,
"step": 1560
},
{
"epoch": 0.13,
"grad_norm": 50.29837243421749,
"learning_rate": 1.7017751479289942e-05,
"loss": 0.3203,
"step": 1580
},
{
"epoch": 0.13,
"grad_norm": 7.03929850767991,
"learning_rate": 1.723076923076923e-05,
"loss": 0.297,
"step": 1600
},
{
"epoch": 0.13,
"grad_norm": 7.19469430195841,
"learning_rate": 1.7443786982248522e-05,
"loss": 0.2978,
"step": 1620
},
{
"epoch": 0.13,
"grad_norm": 7.055651559785636,
"learning_rate": 1.765680473372781e-05,
"loss": 0.3313,
"step": 1640
},
{
"epoch": 0.13,
"grad_norm": 7.162934891578056,
"learning_rate": 1.78698224852071e-05,
"loss": 0.3188,
"step": 1660
},
{
"epoch": 0.14,
"grad_norm": 7.176153099629785,
"learning_rate": 1.808284023668639e-05,
"loss": 0.3297,
"step": 1680
},
{
"epoch": 0.14,
"grad_norm": 6.367781797418254,
"learning_rate": 1.8295857988165683e-05,
"loss": 0.3042,
"step": 1700
},
{
"epoch": 0.14,
"grad_norm": 6.373173769997794,
"learning_rate": 1.8508875739644975e-05,
"loss": 0.3266,
"step": 1720
},
{
"epoch": 0.14,
"grad_norm": 18.577261761115807,
"learning_rate": 1.8721893491124264e-05,
"loss": 0.2736,
"step": 1740
},
{
"epoch": 0.14,
"grad_norm": 4.994768648489008,
"learning_rate": 1.8934911242603552e-05,
"loss": 0.3012,
"step": 1760
},
{
"epoch": 0.14,
"grad_norm": 6.301807374403158,
"learning_rate": 1.9147928994082844e-05,
"loss": 0.3394,
"step": 1780
},
{
"epoch": 0.15,
"grad_norm": 7.456967816650055,
"learning_rate": 1.9360946745562133e-05,
"loss": 0.3329,
"step": 1800
},
{
"epoch": 0.15,
"grad_norm": 6.0971107018925625,
"learning_rate": 1.957396449704142e-05,
"loss": 0.3523,
"step": 1820
},
{
"epoch": 0.15,
"grad_norm": 8.10379030061416,
"learning_rate": 1.9786982248520713e-05,
"loss": 0.3013,
"step": 1840
},
{
"epoch": 0.15,
"grad_norm": 7.024454181142386,
"learning_rate": 1.9999999960412883e-05,
"loss": 0.2762,
"step": 1860
},
{
"epoch": 0.15,
"grad_norm": 8.232339625552946,
"learning_rate": 1.9999982542086008e-05,
"loss": 0.3448,
"step": 1880
},
{
"epoch": 0.15,
"grad_norm": 7.602098687169412,
"learning_rate": 1.9999933454128334e-05,
"loss": 0.3398,
"step": 1900
},
{
"epoch": 0.15,
"grad_norm": 13.60330741158854,
"learning_rate": 1.9999852696695326e-05,
"loss": 0.3449,
"step": 1920
},
{
"epoch": 0.16,
"grad_norm": 6.868480851347135,
"learning_rate": 1.9999740270042764e-05,
"loss": 0.3047,
"step": 1940
},
{
"epoch": 0.16,
"grad_norm": 8.419546962701075,
"learning_rate": 1.9999596174526744e-05,
"loss": 0.296,
"step": 1960
},
{
"epoch": 0.16,
"grad_norm": 8.258953437915027,
"learning_rate": 1.9999420410603655e-05,
"loss": 0.3145,
"step": 1980
},
{
"epoch": 0.16,
"grad_norm": 8.591067141592344,
"learning_rate": 1.9999212978830192e-05,
"loss": 0.2967,
"step": 2000
},
{
"epoch": 0.16,
"grad_norm": 7.788694443802249,
"learning_rate": 1.9998973879863347e-05,
"loss": 0.2922,
"step": 2020
},
{
"epoch": 0.16,
"grad_norm": 8.894924525369044,
"learning_rate": 1.999870311446042e-05,
"loss": 0.2909,
"step": 2040
},
{
"epoch": 0.17,
"grad_norm": 7.71785690762546,
"learning_rate": 1.9998400683478994e-05,
"loss": 0.3185,
"step": 2060
},
{
"epoch": 0.17,
"grad_norm": 6.423565941270898,
"learning_rate": 1.9998066587876964e-05,
"loss": 0.331,
"step": 2080
},
{
"epoch": 0.17,
"grad_norm": 9.504973464920754,
"learning_rate": 1.9997700828712502e-05,
"loss": 0.3163,
"step": 2100
},
{
"epoch": 0.17,
"grad_norm": 5.0751162961967555,
"learning_rate": 1.999730340714407e-05,
"loss": 0.2935,
"step": 2120
},
{
"epoch": 0.17,
"grad_norm": 6.788588249402219,
"learning_rate": 1.9996874324430414e-05,
"loss": 0.304,
"step": 2140
},
{
"epoch": 0.17,
"grad_norm": 6.982728358658279,
"learning_rate": 1.9996413581930564e-05,
"loss": 0.3254,
"step": 2160
},
{
"epoch": 0.18,
"grad_norm": 7.39883321894128,
"learning_rate": 1.9995921181103827e-05,
"loss": 0.3238,
"step": 2180
},
{
"epoch": 0.18,
"grad_norm": 6.349466388539233,
"learning_rate": 1.999539712350977e-05,
"loss": 0.3138,
"step": 2200
},
{
"epoch": 0.18,
"grad_norm": 8.460848680137447,
"learning_rate": 1.9994841410808238e-05,
"loss": 0.2951,
"step": 2220
},
{
"epoch": 0.18,
"grad_norm": 12.228980891015102,
"learning_rate": 1.999425404475933e-05,
"loss": 0.313,
"step": 2240
},
{
"epoch": 0.18,
"grad_norm": 10.128152992141587,
"learning_rate": 1.99936350272234e-05,
"loss": 0.3209,
"step": 2260
},
{
"epoch": 0.18,
"grad_norm": 7.366413095980619,
"learning_rate": 1.999298436016105e-05,
"loss": 0.3508,
"step": 2280
},
{
"epoch": 0.19,
"grad_norm": 5.935510884433497,
"learning_rate": 1.9992302045633138e-05,
"loss": 0.3087,
"step": 2300
},
{
"epoch": 0.19,
"grad_norm": 9.917097921103624,
"learning_rate": 1.9991588085800745e-05,
"loss": 0.3272,
"step": 2320
},
{
"epoch": 0.19,
"grad_norm": 8.158864972330328,
"learning_rate": 1.9990842482925183e-05,
"loss": 0.3097,
"step": 2340
},
{
"epoch": 0.19,
"grad_norm": 5.660258353439845,
"learning_rate": 1.999006523936799e-05,
"loss": 0.3194,
"step": 2360
},
{
"epoch": 0.19,
"grad_norm": 6.343908148236521,
"learning_rate": 1.9989256357590915e-05,
"loss": 0.3144,
"step": 2380
},
{
"epoch": 0.19,
"grad_norm": 6.935522124005399,
"learning_rate": 1.9988415840155925e-05,
"loss": 0.316,
"step": 2400
},
{
"epoch": 0.2,
"grad_norm": 6.118420550913593,
"learning_rate": 1.9987543689725172e-05,
"loss": 0.2935,
"step": 2420
},
{
"epoch": 0.2,
"grad_norm": 5.852760915435719,
"learning_rate": 1.998663990906101e-05,
"loss": 0.2982,
"step": 2440
},
{
"epoch": 0.2,
"grad_norm": 10.637858430267903,
"learning_rate": 1.9985704501025967e-05,
"loss": 0.3263,
"step": 2460
},
{
"epoch": 0.2,
"grad_norm": 5.969298652078407,
"learning_rate": 1.9984737468582746e-05,
"loss": 0.2785,
"step": 2480
},
{
"epoch": 0.2,
"grad_norm": 7.290609868079292,
"learning_rate": 1.998373881479422e-05,
"loss": 0.2902,
"step": 2500
},
{
"epoch": 0.2,
"grad_norm": 8.419267863068479,
"learning_rate": 1.9982708542823405e-05,
"loss": 0.2854,
"step": 2520
},
{
"epoch": 0.21,
"grad_norm": 8.123890492905641,
"learning_rate": 1.9981646655933466e-05,
"loss": 0.2981,
"step": 2540
},
{
"epoch": 0.21,
"grad_norm": 5.10058603098674,
"learning_rate": 1.998055315748771e-05,
"loss": 0.2792,
"step": 2560
},
{
"epoch": 0.21,
"grad_norm": 5.6118366078785105,
"learning_rate": 1.997942805094955e-05,
"loss": 0.2905,
"step": 2580
},
{
"epoch": 0.21,
"grad_norm": 11.075016161215812,
"learning_rate": 1.997827133988252e-05,
"loss": 0.2902,
"step": 2600
},
{
"epoch": 0.21,
"grad_norm": 8.887207232453743,
"learning_rate": 1.997708302795026e-05,
"loss": 0.3155,
"step": 2620
},
{
"epoch": 0.21,
"grad_norm": 8.39711790141671,
"learning_rate": 1.997586311891649e-05,
"loss": 0.286,
"step": 2640
},
{
"epoch": 0.21,
"grad_norm": 5.439201702560111,
"learning_rate": 1.9974611616645007e-05,
"loss": 0.2933,
"step": 2660
},
{
"epoch": 0.22,
"grad_norm": 7.182735290178756,
"learning_rate": 1.9973328525099675e-05,
"loss": 0.3267,
"step": 2680
},
{
"epoch": 0.22,
"grad_norm": 8.170254081594555,
"learning_rate": 1.997201384834442e-05,
"loss": 0.2967,
"step": 2700
},
{
"epoch": 0.22,
"grad_norm": 6.357829888020736,
"learning_rate": 1.997066759054319e-05,
"loss": 0.3109,
"step": 2720
},
{
"epoch": 0.22,
"grad_norm": 6.180030398494684,
"learning_rate": 1.996928975595997e-05,
"loss": 0.3054,
"step": 2740
},
{
"epoch": 0.22,
"grad_norm": 7.812651853992933,
"learning_rate": 1.996788034895875e-05,
"loss": 0.2852,
"step": 2760
},
{
"epoch": 0.22,
"grad_norm": 6.492759192826664,
"learning_rate": 1.9966439374003538e-05,
"loss": 0.305,
"step": 2780
},
{
"epoch": 0.23,
"grad_norm": 11.941328303638311,
"learning_rate": 1.99649668356583e-05,
"loss": 0.2922,
"step": 2800
},
{
"epoch": 0.23,
"grad_norm": 8.314661648415811,
"learning_rate": 1.9963462738586993e-05,
"loss": 0.3102,
"step": 2820
},
{
"epoch": 0.23,
"grad_norm": 7.301474320450444,
"learning_rate": 1.996192708755351e-05,
"loss": 0.2964,
"step": 2840
},
{
"epoch": 0.23,
"grad_norm": 8.797389284755965,
"learning_rate": 1.996035988742171e-05,
"loss": 0.2656,
"step": 2860
},
{
"epoch": 0.23,
"grad_norm": 6.614687108304631,
"learning_rate": 1.9958761143155357e-05,
"loss": 0.2927,
"step": 2880
},
{
"epoch": 0.23,
"grad_norm": 7.623906291629947,
"learning_rate": 1.995713085981813e-05,
"loss": 0.2788,
"step": 2900
},
{
"epoch": 0.24,
"grad_norm": 5.684942655651583,
"learning_rate": 1.9955469042573605e-05,
"loss": 0.3051,
"step": 2920
},
{
"epoch": 0.24,
"grad_norm": 8.632124459996572,
"learning_rate": 1.9953775696685223e-05,
"loss": 0.3002,
"step": 2940
},
{
"epoch": 0.24,
"grad_norm": 4.942563514745161,
"learning_rate": 1.99520508275163e-05,
"loss": 0.2862,
"step": 2960
},
{
"epoch": 0.24,
"grad_norm": 6.712590574651518,
"learning_rate": 1.995029444052999e-05,
"loss": 0.2938,
"step": 2980
},
{
"epoch": 0.24,
"grad_norm": 7.09759309831908,
"learning_rate": 1.9948506541289266e-05,
"loss": 0.3054,
"step": 3000
},
{
"epoch": 0.24,
"grad_norm": 9.042408105595795,
"learning_rate": 1.994668713545692e-05,
"loss": 0.3041,
"step": 3020
},
{
"epoch": 0.25,
"grad_norm": 6.843804903550461,
"learning_rate": 1.994483622879553e-05,
"loss": 0.2958,
"step": 3040
},
{
"epoch": 0.25,
"grad_norm": 8.109882081629157,
"learning_rate": 1.9942953827167443e-05,
"loss": 0.3115,
"step": 3060
},
{
"epoch": 0.25,
"grad_norm": 6.519937602019556,
"learning_rate": 1.994103993653476e-05,
"loss": 0.2873,
"step": 3080
},
{
"epoch": 0.25,
"grad_norm": 43.825640526729615,
"learning_rate": 1.9939094562959324e-05,
"loss": 0.3084,
"step": 3100
},
{
"epoch": 0.25,
"grad_norm": 6.740302289754918,
"learning_rate": 1.993711771260268e-05,
"loss": 0.2898,
"step": 3120
},
{
"epoch": 0.25,
"grad_norm": 8.585263920916868,
"learning_rate": 1.993510939172609e-05,
"loss": 0.303,
"step": 3140
},
{
"epoch": 0.26,
"grad_norm": 7.715090235382078,
"learning_rate": 1.9933069606690468e-05,
"loss": 0.3102,
"step": 3160
},
{
"epoch": 0.26,
"grad_norm": 9.780531981807941,
"learning_rate": 1.99309983639564e-05,
"loss": 0.3077,
"step": 3180
},
{
"epoch": 0.26,
"grad_norm": 4.606128027451412,
"learning_rate": 1.99288956700841e-05,
"loss": 0.3131,
"step": 3200
},
{
"epoch": 0.26,
"grad_norm": 6.5456142622794875,
"learning_rate": 1.9926761531733403e-05,
"loss": 0.2899,
"step": 3220
},
{
"epoch": 0.26,
"grad_norm": 6.881966685047346,
"learning_rate": 1.9924595955663732e-05,
"loss": 0.2834,
"step": 3240
},
{
"epoch": 0.26,
"grad_norm": 6.086009895569889,
"learning_rate": 1.9922398948734088e-05,
"loss": 0.2887,
"step": 3260
},
{
"epoch": 0.26,
"grad_norm": 5.329585705771699,
"learning_rate": 1.992017051790301e-05,
"loss": 0.2888,
"step": 3280
},
{
"epoch": 0.27,
"grad_norm": 9.3500534790468,
"learning_rate": 1.991791067022858e-05,
"loss": 0.3168,
"step": 3300
},
{
"epoch": 0.27,
"grad_norm": 6.741688450171789,
"learning_rate": 1.9915619412868387e-05,
"loss": 0.2703,
"step": 3320
},
{
"epoch": 0.27,
"grad_norm": 6.86462812934889,
"learning_rate": 1.9913296753079484e-05,
"loss": 0.3141,
"step": 3340
},
{
"epoch": 0.27,
"grad_norm": 6.6699035733643495,
"learning_rate": 1.9910942698218404e-05,
"loss": 0.2922,
"step": 3360
},
{
"epoch": 0.27,
"grad_norm": 8.43685481112505,
"learning_rate": 1.990855725574111e-05,
"loss": 0.2836,
"step": 3380
},
{
"epoch": 0.27,
"grad_norm": 10.06638161800925,
"learning_rate": 1.990614043320298e-05,
"loss": 0.2949,
"step": 3400
},
{
"epoch": 0.28,
"grad_norm": 8.362504433942911,
"learning_rate": 1.9903692238258783e-05,
"loss": 0.2897,
"step": 3420
},
{
"epoch": 0.28,
"grad_norm": 6.788699791177713,
"learning_rate": 1.9901212678662646e-05,
"loss": 0.2907,
"step": 3440
},
{
"epoch": 0.28,
"grad_norm": 6.40238575575375,
"learning_rate": 1.989870176226804e-05,
"loss": 0.2609,
"step": 3460
},
{
"epoch": 0.28,
"grad_norm": 5.227206933131435,
"learning_rate": 1.9896159497027758e-05,
"loss": 0.3162,
"step": 3480
},
{
"epoch": 0.28,
"grad_norm": 6.191089860311128,
"learning_rate": 1.9893585890993877e-05,
"loss": 0.2998,
"step": 3500
},
{
"epoch": 0.28,
"grad_norm": 8.156421644955156,
"learning_rate": 1.9890980952317745e-05,
"loss": 0.2683,
"step": 3520
},
{
"epoch": 0.29,
"grad_norm": 7.62639892752842,
"learning_rate": 1.9888344689249945e-05,
"loss": 0.3138,
"step": 3540
},
{
"epoch": 0.29,
"grad_norm": 8.10913885283575,
"learning_rate": 1.9885677110140272e-05,
"loss": 0.3098,
"step": 3560
},
{
"epoch": 0.29,
"grad_norm": 5.974197538110473,
"learning_rate": 1.988297822343771e-05,
"loss": 0.2879,
"step": 3580
},
{
"epoch": 0.29,
"grad_norm": 8.13170124417466,
"learning_rate": 1.9880248037690406e-05,
"loss": 0.2741,
"step": 3600
},
{
"epoch": 0.29,
"grad_norm": 5.373939941911109,
"learning_rate": 1.9877486561545635e-05,
"loss": 0.2818,
"step": 3620
},
{
"epoch": 0.29,
"grad_norm": 6.876975035910139,
"learning_rate": 1.9874693803749786e-05,
"loss": 0.2872,
"step": 3640
},
{
"epoch": 0.3,
"grad_norm": 11.88859663115872,
"learning_rate": 1.987186977314831e-05,
"loss": 0.2787,
"step": 3660
},
{
"epoch": 0.3,
"grad_norm": 5.296482127875842,
"learning_rate": 1.9869014478685726e-05,
"loss": 0.3125,
"step": 3680
},
{
"epoch": 0.3,
"grad_norm": 10.902431223896663,
"learning_rate": 1.986612792940556e-05,
"loss": 0.2696,
"step": 3700
},
{
"epoch": 0.3,
"grad_norm": 7.957172435618448,
"learning_rate": 1.986321013445034e-05,
"loss": 0.2846,
"step": 3720
},
{
"epoch": 0.3,
"grad_norm": 5.49530713404051,
"learning_rate": 1.9860261103061555e-05,
"loss": 0.2904,
"step": 3740
},
{
"epoch": 0.3,
"grad_norm": 6.7681775640908315,
"learning_rate": 1.985728084457963e-05,
"loss": 0.2907,
"step": 3760
},
{
"epoch": 0.31,
"grad_norm": 11.417291183282801,
"learning_rate": 1.9854269368443898e-05,
"loss": 0.3124,
"step": 3780
},
{
"epoch": 0.31,
"grad_norm": 9.165271676007183,
"learning_rate": 1.985122668419255e-05,
"loss": 0.2938,
"step": 3800
},
{
"epoch": 0.31,
"grad_norm": 9.710590629489802,
"learning_rate": 1.984815280146265e-05,
"loss": 0.2805,
"step": 3820
},
{
"epoch": 0.31,
"grad_norm": 10.32416184835814,
"learning_rate": 1.9845047729990052e-05,
"loss": 0.2939,
"step": 3840
},
{
"epoch": 0.31,
"grad_norm": 6.123004510419631,
"learning_rate": 1.984191147960941e-05,
"loss": 0.3217,
"step": 3860
},
{
"epoch": 0.31,
"grad_norm": 8.419418288045916,
"learning_rate": 1.9838744060254113e-05,
"loss": 0.2466,
"step": 3880
},
{
"epoch": 0.31,
"grad_norm": 8.941869987837809,
"learning_rate": 1.9835545481956295e-05,
"loss": 0.3091,
"step": 3900
},
{
"epoch": 0.32,
"grad_norm": 6.854852736746462,
"learning_rate": 1.983231575484676e-05,
"loss": 0.3094,
"step": 3920
},
{
"epoch": 0.32,
"grad_norm": 10.162127205743055,
"learning_rate": 1.9829054889154978e-05,
"loss": 0.2988,
"step": 3940
},
{
"epoch": 0.32,
"grad_norm": 6.1276753090877385,
"learning_rate": 1.982576289520904e-05,
"loss": 0.2875,
"step": 3960
},
{
"epoch": 0.32,
"grad_norm": 6.806977159453115,
"learning_rate": 1.982243978343562e-05,
"loss": 0.2943,
"step": 3980
},
{
"epoch": 0.32,
"grad_norm": 6.9055487505442015,
"learning_rate": 1.9819085564359977e-05,
"loss": 0.2911,
"step": 4000
},
{
"epoch": 0.32,
"grad_norm": 7.466453294884225,
"learning_rate": 1.9815700248605875e-05,
"loss": 0.2902,
"step": 4020
},
{
"epoch": 0.33,
"grad_norm": 5.488844395318609,
"learning_rate": 1.9812283846895572e-05,
"loss": 0.2773,
"step": 4040
},
{
"epoch": 0.33,
"grad_norm": 5.492586688406755,
"learning_rate": 1.9808836370049786e-05,
"loss": 0.2942,
"step": 4060
},
{
"epoch": 0.33,
"grad_norm": 6.870365314571275,
"learning_rate": 1.980535782898766e-05,
"loss": 0.3134,
"step": 4080
},
{
"epoch": 0.33,
"grad_norm": 6.474349542297636,
"learning_rate": 1.9801848234726733e-05,
"loss": 0.278,
"step": 4100
},
{
"epoch": 0.33,
"grad_norm": 7.02153354250866,
"learning_rate": 1.9798307598382887e-05,
"loss": 0.3008,
"step": 4120
},
{
"epoch": 0.33,
"grad_norm": 6.5342549251431725,
"learning_rate": 1.9794735931170323e-05,
"loss": 0.2588,
"step": 4140
},
{
"epoch": 0.34,
"grad_norm": 7.235161691162515,
"learning_rate": 1.9791133244401536e-05,
"loss": 0.2892,
"step": 4160
},
{
"epoch": 0.34,
"grad_norm": 6.613883714897734,
"learning_rate": 1.978749954948726e-05,
"loss": 0.3042,
"step": 4180
},
{
"epoch": 0.34,
"grad_norm": 5.588985182579549,
"learning_rate": 1.978383485793645e-05,
"loss": 0.2895,
"step": 4200
},
{
"epoch": 0.34,
"grad_norm": 41.788686405813685,
"learning_rate": 1.9780139181356223e-05,
"loss": 0.2967,
"step": 4220
},
{
"epoch": 0.34,
"grad_norm": 8.000329487691184,
"learning_rate": 1.9776412531451845e-05,
"loss": 0.3068,
"step": 4240
},
{
"epoch": 0.34,
"grad_norm": 8.858664509374336,
"learning_rate": 1.977265492002667e-05,
"loss": 0.2904,
"step": 4260
},
{
"epoch": 0.35,
"grad_norm": 6.412322055660321,
"learning_rate": 1.9768866358982138e-05,
"loss": 0.302,
"step": 4280
},
{
"epoch": 0.35,
"grad_norm": 6.15402072878952,
"learning_rate": 1.9765046860317697e-05,
"loss": 0.2753,
"step": 4300
},
{
"epoch": 0.35,
"grad_norm": 7.363823390602094,
"learning_rate": 1.9761196436130792e-05,
"loss": 0.3077,
"step": 4320
},
{
"epoch": 0.35,
"grad_norm": 5.820012641709484,
"learning_rate": 1.9757315098616813e-05,
"loss": 0.3024,
"step": 4340
},
{
"epoch": 0.35,
"grad_norm": 5.291771334516593,
"learning_rate": 1.975340286006906e-05,
"loss": 0.2732,
"step": 4360
},
{
"epoch": 0.35,
"grad_norm": 5.880570388428466,
"learning_rate": 1.9749459732878716e-05,
"loss": 0.2491,
"step": 4380
},
{
"epoch": 0.36,
"grad_norm": 5.655270317760537,
"learning_rate": 1.9745485729534788e-05,
"loss": 0.2803,
"step": 4400
},
{
"epoch": 0.36,
"grad_norm": 6.013092379821028,
"learning_rate": 1.974148086262408e-05,
"loss": 0.2803,
"step": 4420
},
{
"epoch": 0.36,
"grad_norm": 5.7211652147787975,
"learning_rate": 1.9737445144831136e-05,
"loss": 0.2637,
"step": 4440
},
{
"epoch": 0.36,
"grad_norm": 8.131187547800137,
"learning_rate": 1.973337858893824e-05,
"loss": 0.3255,
"step": 4460
},
{
"epoch": 0.36,
"grad_norm": 7.137552013307909,
"learning_rate": 1.972928120782533e-05,
"loss": 0.2668,
"step": 4480
},
{
"epoch": 0.36,
"grad_norm": 5.150009725617049,
"learning_rate": 1.972515301446998e-05,
"loss": 0.2854,
"step": 4500
},
{
"epoch": 0.36,
"grad_norm": 6.331424258094408,
"learning_rate": 1.972099402194736e-05,
"loss": 0.2866,
"step": 4520
},
{
"epoch": 0.37,
"grad_norm": 8.392412446366174,
"learning_rate": 1.9716804243430176e-05,
"loss": 0.2616,
"step": 4540
},
{
"epoch": 0.37,
"grad_norm": 7.299549339702017,
"learning_rate": 1.971258369218867e-05,
"loss": 0.2983,
"step": 4560
},
{
"epoch": 0.37,
"grad_norm": 6.2169745129545575,
"learning_rate": 1.970833238159051e-05,
"loss": 0.276,
"step": 4580
},
{
"epoch": 0.37,
"grad_norm": 7.506632605972847,
"learning_rate": 1.9704050325100827e-05,
"loss": 0.2951,
"step": 4600
},
{
"epoch": 0.37,
"grad_norm": 6.66483208527068,
"learning_rate": 1.969973753628211e-05,
"loss": 0.2784,
"step": 4620
},
{
"epoch": 0.37,
"grad_norm": 7.211925104193477,
"learning_rate": 1.9695394028794195e-05,
"loss": 0.2729,
"step": 4640
},
{
"epoch": 0.38,
"grad_norm": 4.20991234694906,
"learning_rate": 1.9691019816394204e-05,
"loss": 0.3152,
"step": 4660
},
{
"epoch": 0.38,
"grad_norm": 5.975539574939649,
"learning_rate": 1.9686614912936516e-05,
"loss": 0.2747,
"step": 4680
},
{
"epoch": 0.38,
"grad_norm": 6.135748637813934,
"learning_rate": 1.968217933237272e-05,
"loss": 0.3028,
"step": 4700
},
{
"epoch": 0.38,
"grad_norm": 8.994912298940163,
"learning_rate": 1.9677713088751562e-05,
"loss": 0.3043,
"step": 4720
},
{
"epoch": 0.38,
"grad_norm": 7.649871286543558,
"learning_rate": 1.967321619621892e-05,
"loss": 0.2577,
"step": 4740
},
{
"epoch": 0.38,
"grad_norm": 6.035703921853307,
"learning_rate": 1.9668688669017722e-05,
"loss": 0.2596,
"step": 4760
},
{
"epoch": 0.39,
"grad_norm": 5.4070428696843615,
"learning_rate": 1.9664130521487946e-05,
"loss": 0.2885,
"step": 4780
},
{
"epoch": 0.39,
"grad_norm": 9.68037240943506,
"learning_rate": 1.9659541768066545e-05,
"loss": 0.2739,
"step": 4800
},
{
"epoch": 0.39,
"grad_norm": 7.032775442165197,
"learning_rate": 1.965492242328741e-05,
"loss": 0.2832,
"step": 4820
},
{
"epoch": 0.39,
"grad_norm": 7.038266627020968,
"learning_rate": 1.9650272501781326e-05,
"loss": 0.3053,
"step": 4840
},
{
"epoch": 0.39,
"grad_norm": 4.469246363249616,
"learning_rate": 1.9645592018275917e-05,
"loss": 0.2922,
"step": 4860
},
{
"epoch": 0.39,
"grad_norm": 9.997476259295432,
"learning_rate": 1.964088098759561e-05,
"loss": 0.3029,
"step": 4880
},
{
"epoch": 0.4,
"grad_norm": 4.1359256203786705,
"learning_rate": 1.9636139424661588e-05,
"loss": 0.2885,
"step": 4900
},
{
"epoch": 0.4,
"grad_norm": 6.857022727186512,
"learning_rate": 1.9631367344491735e-05,
"loss": 0.263,
"step": 4920
},
{
"epoch": 0.4,
"grad_norm": 6.720261230840821,
"learning_rate": 1.9626564762200583e-05,
"loss": 0.3083,
"step": 4940
},
{
"epoch": 0.4,
"grad_norm": 7.113731977499931,
"learning_rate": 1.9621731692999284e-05,
"loss": 0.2789,
"step": 4960
},
{
"epoch": 0.4,
"grad_norm": 6.665634774774537,
"learning_rate": 1.961686815219555e-05,
"loss": 0.2591,
"step": 4980
},
{
"epoch": 0.4,
"grad_norm": 4.599220599612653,
"learning_rate": 1.9611974155193597e-05,
"loss": 0.2753,
"step": 5000
},
{
"epoch": 0.41,
"grad_norm": 6.564880520618788,
"learning_rate": 1.960704971749411e-05,
"loss": 0.2805,
"step": 5020
},
{
"epoch": 0.41,
"grad_norm": 7.418663470463415,
"learning_rate": 1.9602094854694194e-05,
"loss": 0.2782,
"step": 5040
},
{
"epoch": 0.41,
"grad_norm": 5.491847672130194,
"learning_rate": 1.9597109582487313e-05,
"loss": 0.2702,
"step": 5060
},
{
"epoch": 0.41,
"grad_norm": 12.43956214256869,
"learning_rate": 1.9592093916663242e-05,
"loss": 0.2972,
"step": 5080
},
{
"epoch": 0.41,
"grad_norm": 8.800236820155485,
"learning_rate": 1.958704787310803e-05,
"loss": 0.2725,
"step": 5100
},
{
"epoch": 0.41,
"grad_norm": 6.644759742176537,
"learning_rate": 1.9581971467803934e-05,
"loss": 0.289,
"step": 5120
},
{
"epoch": 0.41,
"grad_norm": 5.009336147526538,
"learning_rate": 1.9576864716829377e-05,
"loss": 0.2969,
"step": 5140
},
{
"epoch": 0.42,
"grad_norm": 5.803503477935393,
"learning_rate": 1.95717276363589e-05,
"loss": 0.2774,
"step": 5160
},
{
"epoch": 0.42,
"grad_norm": 5.894993950320594,
"learning_rate": 1.95665602426631e-05,
"loss": 0.2273,
"step": 5180
},
{
"epoch": 0.42,
"grad_norm": 6.96986305003759,
"learning_rate": 1.956136255210859e-05,
"loss": 0.2736,
"step": 5200
},
{
"epoch": 0.42,
"grad_norm": 9.605041419937288,
"learning_rate": 1.955613458115793e-05,
"loss": 0.2907,
"step": 5220
},
{
"epoch": 0.42,
"grad_norm": 6.647244804794919,
"learning_rate": 1.9550876346369615e-05,
"loss": 0.261,
"step": 5240
},
{
"epoch": 0.42,
"grad_norm": 5.607026269896423,
"learning_rate": 1.9545587864397955e-05,
"loss": 0.3143,
"step": 5260
},
{
"epoch": 0.43,
"grad_norm": 13.142033450455475,
"learning_rate": 1.954026915199309e-05,
"loss": 0.2434,
"step": 5280
},
{
"epoch": 0.43,
"grad_norm": 3.248788167531875,
"learning_rate": 1.9534920226000902e-05,
"loss": 0.2705,
"step": 5300
},
{
"epoch": 0.43,
"grad_norm": 13.164180730181236,
"learning_rate": 1.9529541103362962e-05,
"loss": 0.2862,
"step": 5320
},
{
"epoch": 0.43,
"grad_norm": 5.831970091880435,
"learning_rate": 1.9524131801116487e-05,
"loss": 0.3054,
"step": 5340
},
{
"epoch": 0.43,
"grad_norm": 5.446448668681817,
"learning_rate": 1.951869233639428e-05,
"loss": 0.2671,
"step": 5360
},
{
"epoch": 0.43,
"grad_norm": 6.302315229032403,
"learning_rate": 1.951322272642468e-05,
"loss": 0.2765,
"step": 5380
},
{
"epoch": 0.44,
"grad_norm": 11.243376074569383,
"learning_rate": 1.9507722988531502e-05,
"loss": 0.2582,
"step": 5400
},
{
"epoch": 0.44,
"grad_norm": 7.673489589122099,
"learning_rate": 1.9502193140133983e-05,
"loss": 0.3143,
"step": 5420
},
{
"epoch": 0.44,
"grad_norm": 7.9949329541838265,
"learning_rate": 1.9496633198746736e-05,
"loss": 0.2862,
"step": 5440
},
{
"epoch": 0.44,
"grad_norm": 4.907903307657898,
"learning_rate": 1.9491043181979677e-05,
"loss": 0.2926,
"step": 5460
},
{
"epoch": 0.44,
"grad_norm": 8.67569902665337,
"learning_rate": 1.9485423107537986e-05,
"loss": 0.2741,
"step": 5480
},
{
"epoch": 0.44,
"grad_norm": 7.103858232561379,
"learning_rate": 1.9479772993222038e-05,
"loss": 0.2767,
"step": 5500
},
{
"epoch": 0.45,
"grad_norm": 8.73970254143099,
"learning_rate": 1.947409285692736e-05,
"loss": 0.232,
"step": 5520
},
{
"epoch": 0.45,
"grad_norm": 5.799420807956918,
"learning_rate": 1.946838271664457e-05,
"loss": 0.286,
"step": 5540
},
{
"epoch": 0.45,
"grad_norm": 5.575038878985263,
"learning_rate": 1.9462642590459306e-05,
"loss": 0.2361,
"step": 5560
},
{
"epoch": 0.45,
"grad_norm": 5.718902752613272,
"learning_rate": 1.9456872496552184e-05,
"loss": 0.2781,
"step": 5580
},
{
"epoch": 0.45,
"grad_norm": 6.156497233891847,
"learning_rate": 1.9451072453198742e-05,
"loss": 0.2798,
"step": 5600
},
{
"epoch": 0.45,
"grad_norm": 8.158434812741351,
"learning_rate": 1.9445242478769374e-05,
"loss": 0.2629,
"step": 5620
},
{
"epoch": 0.46,
"grad_norm": 4.100053898526847,
"learning_rate": 1.9439382591729265e-05,
"loss": 0.2616,
"step": 5640
},
{
"epoch": 0.46,
"grad_norm": 10.398486291325238,
"learning_rate": 1.9433492810638355e-05,
"loss": 0.281,
"step": 5660
},
{
"epoch": 0.46,
"grad_norm": 7.054531497011973,
"learning_rate": 1.942757315415126e-05,
"loss": 0.2899,
"step": 5680
},
{
"epoch": 0.46,
"grad_norm": 5.42866324225203,
"learning_rate": 1.9421623641017218e-05,
"loss": 0.3102,
"step": 5700
},
{
"epoch": 0.46,
"grad_norm": 7.133905299895763,
"learning_rate": 1.941564429008004e-05,
"loss": 0.2616,
"step": 5720
},
{
"epoch": 0.46,
"grad_norm": 6.043327611859842,
"learning_rate": 1.9409635120278035e-05,
"loss": 0.2614,
"step": 5740
},
{
"epoch": 0.46,
"grad_norm": 7.125098244508386,
"learning_rate": 1.9403596150643957e-05,
"loss": 0.2732,
"step": 5760
},
{
"epoch": 0.47,
"grad_norm": 7.809924868565428,
"learning_rate": 1.9397527400304944e-05,
"loss": 0.2537,
"step": 5780
},
{
"epoch": 0.47,
"grad_norm": 6.721114625225486,
"learning_rate": 1.9391428888482466e-05,
"loss": 0.2935,
"step": 5800
},
{
"epoch": 0.47,
"grad_norm": 5.41764363012664,
"learning_rate": 1.9385300634492244e-05,
"loss": 0.2644,
"step": 5820
},
{
"epoch": 0.47,
"grad_norm": 6.577839969793495,
"learning_rate": 1.937914265774421e-05,
"loss": 0.2822,
"step": 5840
},
{
"epoch": 0.47,
"grad_norm": 5.446653006796048,
"learning_rate": 1.9372954977742437e-05,
"loss": 0.2767,
"step": 5860
},
{
"epoch": 0.47,
"grad_norm": 6.276051357995197,
"learning_rate": 1.9366737614085067e-05,
"loss": 0.2693,
"step": 5880
},
{
"epoch": 0.48,
"grad_norm": 5.1174083399984935,
"learning_rate": 1.9360490586464265e-05,
"loss": 0.2968,
"step": 5900
},
{
"epoch": 0.48,
"grad_norm": 8.060800772518713,
"learning_rate": 1.9354213914666154e-05,
"loss": 0.3042,
"step": 5920
},
{
"epoch": 0.48,
"grad_norm": 6.946903213944759,
"learning_rate": 1.934790761857074e-05,
"loss": 0.2896,
"step": 5940
},
{
"epoch": 0.48,
"grad_norm": 6.9058705221323855,
"learning_rate": 1.934157171815187e-05,
"loss": 0.2697,
"step": 5960
},
{
"epoch": 0.48,
"grad_norm": 6.140873173559298,
"learning_rate": 1.9335206233477138e-05,
"loss": 0.3012,
"step": 5980
},
{
"epoch": 0.48,
"grad_norm": 6.2613108710926415,
"learning_rate": 1.9328811184707857e-05,
"loss": 0.2616,
"step": 6000
},
{
"epoch": 0.49,
"grad_norm": 7.103058610195544,
"learning_rate": 1.932238659209897e-05,
"loss": 0.2476,
"step": 6020
},
{
"epoch": 0.49,
"grad_norm": 7.721454841424903,
"learning_rate": 1.9315932475998994e-05,
"loss": 0.2772,
"step": 6040
},
{
"epoch": 0.49,
"grad_norm": 7.2979455852743245,
"learning_rate": 1.930944885684996e-05,
"loss": 0.2463,
"step": 6060
},
{
"epoch": 0.49,
"grad_norm": 5.194835577851161,
"learning_rate": 1.9302935755187335e-05,
"loss": 0.2595,
"step": 6080
},
{
"epoch": 0.49,
"grad_norm": 8.826615087967348,
"learning_rate": 1.9296393191639976e-05,
"loss": 0.2728,
"step": 6100
},
{
"epoch": 0.49,
"grad_norm": 4.8811686686091,
"learning_rate": 1.9289821186930038e-05,
"loss": 0.2998,
"step": 6120
},
{
"epoch": 0.5,
"grad_norm": 6.458493860362177,
"learning_rate": 1.9283219761872943e-05,
"loss": 0.2608,
"step": 6140
},
{
"epoch": 0.5,
"grad_norm": 7.035199086314618,
"learning_rate": 1.9276588937377293e-05,
"loss": 0.2789,
"step": 6160
},
{
"epoch": 0.5,
"grad_norm": 5.133893925330738,
"learning_rate": 1.9269928734444792e-05,
"loss": 0.2858,
"step": 6180
},
{
"epoch": 0.5,
"grad_norm": 6.888562228890644,
"learning_rate": 1.9263239174170203e-05,
"loss": 0.263,
"step": 6200
},
{
"epoch": 0.5,
"grad_norm": 7.247178153358964,
"learning_rate": 1.9256520277741276e-05,
"loss": 0.2887,
"step": 6220
},
{
"epoch": 0.5,
"grad_norm": 5.8972079378636755,
"learning_rate": 1.9249772066438676e-05,
"loss": 0.2693,
"step": 6240
},
{
"epoch": 0.51,
"grad_norm": 4.962455745470868,
"learning_rate": 1.924299456163591e-05,
"loss": 0.2749,
"step": 6260
},
{
"epoch": 0.51,
"grad_norm": 7.258216936978947,
"learning_rate": 1.9236187784799267e-05,
"loss": 0.2957,
"step": 6280
},
{
"epoch": 0.51,
"grad_norm": 6.194723517380506,
"learning_rate": 1.9229351757487757e-05,
"loss": 0.2773,
"step": 6300
},
{
"epoch": 0.51,
"grad_norm": 6.5263758820129505,
"learning_rate": 1.9222486501353027e-05,
"loss": 0.293,
"step": 6320
},
{
"epoch": 0.51,
"grad_norm": 6.366073941639377,
"learning_rate": 1.9215592038139296e-05,
"loss": 0.2755,
"step": 6340
},
{
"epoch": 0.51,
"grad_norm": 116.03864354978906,
"learning_rate": 1.9208668389683308e-05,
"loss": 0.251,
"step": 6360
},
{
"epoch": 0.52,
"grad_norm": 5.264420821601751,
"learning_rate": 1.9201715577914223e-05,
"loss": 0.2845,
"step": 6380
},
{
"epoch": 0.52,
"grad_norm": 7.40581166704595,
"learning_rate": 1.9194733624853584e-05,
"loss": 0.2632,
"step": 6400
},
{
"epoch": 0.52,
"grad_norm": 5.941729193844859,
"learning_rate": 1.918772255261523e-05,
"loss": 0.2597,
"step": 6420
},
{
"epoch": 0.52,
"grad_norm": 6.312855927658963,
"learning_rate": 1.9180682383405227e-05,
"loss": 0.2692,
"step": 6440
},
{
"epoch": 0.52,
"grad_norm": 6.071350558765477,
"learning_rate": 1.9173613139521798e-05,
"loss": 0.2731,
"step": 6460
},
{
"epoch": 0.52,
"grad_norm": 8.993502072541519,
"learning_rate": 1.9166514843355254e-05,
"loss": 0.2548,
"step": 6480
},
{
"epoch": 0.52,
"grad_norm": 7.005495911170442,
"learning_rate": 1.9159387517387924e-05,
"loss": 0.2612,
"step": 6500
},
{
"epoch": 0.53,
"grad_norm": 7.665844444744844,
"learning_rate": 1.915223118419409e-05,
"loss": 0.2501,
"step": 6520
},
{
"epoch": 0.53,
"grad_norm": 6.458334088069092,
"learning_rate": 1.9145045866439892e-05,
"loss": 0.2762,
"step": 6540
},
{
"epoch": 0.53,
"grad_norm": 6.819348453934704,
"learning_rate": 1.9137831586883288e-05,
"loss": 0.2826,
"step": 6560
},
{
"epoch": 0.53,
"grad_norm": 5.080353890354994,
"learning_rate": 1.9130588368373958e-05,
"loss": 0.2738,
"step": 6580
},
{
"epoch": 0.53,
"grad_norm": 5.836344060052037,
"learning_rate": 1.912331623385324e-05,
"loss": 0.2586,
"step": 6600
},
{
"epoch": 0.53,
"grad_norm": 8.357013119166787,
"learning_rate": 1.9116015206354067e-05,
"loss": 0.3174,
"step": 6620
},
{
"epoch": 0.54,
"grad_norm": 4.307353833021694,
"learning_rate": 1.9108685309000866e-05,
"loss": 0.2721,
"step": 6640
},
{
"epoch": 0.54,
"grad_norm": 7.002584797605542,
"learning_rate": 1.9101326565009517e-05,
"loss": 0.2581,
"step": 6660
},
{
"epoch": 0.54,
"grad_norm": 7.299065402050334,
"learning_rate": 1.909393899768726e-05,
"loss": 0.2933,
"step": 6680
},
{
"epoch": 0.54,
"grad_norm": 6.57009941780665,
"learning_rate": 1.9086522630432638e-05,
"loss": 0.2843,
"step": 6700
},
{
"epoch": 0.54,
"grad_norm": 6.857683881351832,
"learning_rate": 1.907907748673539e-05,
"loss": 0.273,
"step": 6720
},
{
"epoch": 0.54,
"grad_norm": 5.553536898181894,
"learning_rate": 1.9071603590176417e-05,
"loss": 0.2623,
"step": 6740
},
{
"epoch": 0.55,
"grad_norm": 6.30566096206076,
"learning_rate": 1.906410096442768e-05,
"loss": 0.2366,
"step": 6760
},
{
"epoch": 0.55,
"grad_norm": 10.352010603508685,
"learning_rate": 1.9056569633252136e-05,
"loss": 0.2546,
"step": 6780
},
{
"epoch": 0.55,
"grad_norm": 7.80077709337333,
"learning_rate": 1.9049009620503663e-05,
"loss": 0.2763,
"step": 6800
},
{
"epoch": 0.55,
"grad_norm": 8.03821543687826,
"learning_rate": 1.9041420950126976e-05,
"loss": 0.2486,
"step": 6820
},
{
"epoch": 0.55,
"grad_norm": 5.390540434685423,
"learning_rate": 1.9033803646157558e-05,
"loss": 0.2964,
"step": 6840
},
{
"epoch": 0.55,
"grad_norm": 7.430208109717449,
"learning_rate": 1.9026157732721585e-05,
"loss": 0.2681,
"step": 6860
},
{
"epoch": 0.56,
"grad_norm": 7.857246125768761,
"learning_rate": 1.9018483234035845e-05,
"loss": 0.2719,
"step": 6880
},
{
"epoch": 0.56,
"grad_norm": 5.565927371122475,
"learning_rate": 1.901078017440767e-05,
"loss": 0.2703,
"step": 6900
},
{
"epoch": 0.56,
"grad_norm": 6.9976710123560375,
"learning_rate": 1.9003048578234843e-05,
"loss": 0.2566,
"step": 6920
},
{
"epoch": 0.56,
"grad_norm": 6.478229745256355,
"learning_rate": 1.899528847000554e-05,
"loss": 0.253,
"step": 6940
},
{
"epoch": 0.56,
"grad_norm": 5.116336291620225,
"learning_rate": 1.898749987429823e-05,
"loss": 0.2529,
"step": 6960
},
{
"epoch": 0.56,
"grad_norm": 5.587216274900646,
"learning_rate": 1.8979682815781627e-05,
"loss": 0.2848,
"step": 6980
},
{
"epoch": 0.57,
"grad_norm": 5.679239499396368,
"learning_rate": 1.8971837319214586e-05,
"loss": 0.2435,
"step": 7000
},
{
"epoch": 0.57,
"grad_norm": 6.1169436663169074,
"learning_rate": 1.8963963409446022e-05,
"loss": 0.2793,
"step": 7020
},
{
"epoch": 0.57,
"grad_norm": 5.802383972086084,
"learning_rate": 1.8956061111414865e-05,
"loss": 0.2717,
"step": 7040
},
{
"epoch": 0.57,
"grad_norm": 6.507485407821351,
"learning_rate": 1.8948130450149942e-05,
"loss": 0.3011,
"step": 7060
},
{
"epoch": 0.57,
"grad_norm": 5.9621692749685415,
"learning_rate": 1.8940171450769924e-05,
"loss": 0.3076,
"step": 7080
},
{
"epoch": 0.57,
"grad_norm": 54.193219403324576,
"learning_rate": 1.8932184138483223e-05,
"loss": 0.2629,
"step": 7100
},
{
"epoch": 0.57,
"grad_norm": 7.104622864455409,
"learning_rate": 1.8924168538587956e-05,
"loss": 0.2714,
"step": 7120
},
{
"epoch": 0.58,
"grad_norm": 7.0480173880874615,
"learning_rate": 1.8916124676471797e-05,
"loss": 0.2736,
"step": 7140
},
{
"epoch": 0.58,
"grad_norm": 8.412429750327739,
"learning_rate": 1.8908052577611958e-05,
"loss": 0.2644,
"step": 7160
},
{
"epoch": 0.58,
"grad_norm": 8.78692031185526,
"learning_rate": 1.8899952267575083e-05,
"loss": 0.2402,
"step": 7180
},
{
"epoch": 0.58,
"grad_norm": 6.684638721458758,
"learning_rate": 1.889182377201716e-05,
"loss": 0.2542,
"step": 7200
},
{
"epoch": 0.58,
"grad_norm": 6.010627553359556,
"learning_rate": 1.8883667116683457e-05,
"loss": 0.2838,
"step": 7220
},
{
"epoch": 0.58,
"grad_norm": 7.414666524064716,
"learning_rate": 1.887548232740843e-05,
"loss": 0.2851,
"step": 7240
},
{
"epoch": 0.59,
"grad_norm": 8.34076720224061,
"learning_rate": 1.886726943011564e-05,
"loss": 0.2516,
"step": 7260
},
{
"epoch": 0.59,
"grad_norm": 6.094640422225146,
"learning_rate": 1.885902845081767e-05,
"loss": 0.2313,
"step": 7280
},
{
"epoch": 0.59,
"grad_norm": 4.148097252407384,
"learning_rate": 1.8850759415616066e-05,
"loss": 0.2689,
"step": 7300
},
{
"epoch": 0.59,
"grad_norm": 7.561235558598041,
"learning_rate": 1.8842462350701212e-05,
"loss": 0.2983,
"step": 7320
},
{
"epoch": 0.59,
"grad_norm": 7.87091273248736,
"learning_rate": 1.883413728235228e-05,
"loss": 0.2386,
"step": 7340
},
{
"epoch": 0.59,
"grad_norm": 6.197625708495748,
"learning_rate": 1.8825784236937146e-05,
"loss": 0.282,
"step": 7360
},
{
"epoch": 0.6,
"grad_norm": 9.524108011926973,
"learning_rate": 1.8817403240912283e-05,
"loss": 0.2776,
"step": 7380
},
{
"epoch": 0.6,
"grad_norm": 5.147943823246307,
"learning_rate": 1.8808994320822693e-05,
"loss": 0.2625,
"step": 7400
},
{
"epoch": 0.6,
"grad_norm": 5.764109011612628,
"learning_rate": 1.8800557503301827e-05,
"loss": 0.2859,
"step": 7420
},
{
"epoch": 0.6,
"grad_norm": 7.7970156609149335,
"learning_rate": 1.8792092815071498e-05,
"loss": 0.2589,
"step": 7440
},
{
"epoch": 0.6,
"grad_norm": 6.220123292737489,
"learning_rate": 1.8783600282941782e-05,
"loss": 0.269,
"step": 7460
},
{
"epoch": 0.6,
"grad_norm": 6.875941264134116,
"learning_rate": 1.877507993381096e-05,
"loss": 0.2624,
"step": 7480
},
{
"epoch": 0.61,
"grad_norm": 5.721394912188018,
"learning_rate": 1.8766531794665402e-05,
"loss": 0.2571,
"step": 7500
},
{
"epoch": 0.61,
"grad_norm": 6.99318335916291,
"learning_rate": 1.8757955892579504e-05,
"loss": 0.26,
"step": 7520
},
{
"epoch": 0.61,
"grad_norm": 6.692727585899676,
"learning_rate": 1.87493522547156e-05,
"loss": 0.2635,
"step": 7540
},
{
"epoch": 0.61,
"grad_norm": 6.5007755110350525,
"learning_rate": 1.874072090832386e-05,
"loss": 0.2754,
"step": 7560
},
{
"epoch": 0.61,
"grad_norm": 7.775379340923738,
"learning_rate": 1.873206188074223e-05,
"loss": 0.2708,
"step": 7580
},
{
"epoch": 0.61,
"grad_norm": 4.970941791912674,
"learning_rate": 1.872337519939631e-05,
"loss": 0.2592,
"step": 7600
},
{
"epoch": 0.62,
"grad_norm": 7.276189396167904,
"learning_rate": 1.8714660891799302e-05,
"loss": 0.2648,
"step": 7620
},
{
"epoch": 0.62,
"grad_norm": 6.724776718800752,
"learning_rate": 1.870591898555191e-05,
"loss": 0.2606,
"step": 7640
},
{
"epoch": 0.62,
"grad_norm": 7.735768695454274,
"learning_rate": 1.8697149508342237e-05,
"loss": 0.2511,
"step": 7660
},
{
"epoch": 0.62,
"grad_norm": 5.05914779633595,
"learning_rate": 1.868835248794573e-05,
"loss": 0.2609,
"step": 7680
},
{
"epoch": 0.62,
"grad_norm": 8.031314122281715,
"learning_rate": 1.8679527952225054e-05,
"loss": 0.2718,
"step": 7700
},
{
"epoch": 0.62,
"grad_norm": 5.619780768194464,
"learning_rate": 1.867067592913004e-05,
"loss": 0.2717,
"step": 7720
},
{
"epoch": 0.62,
"grad_norm": 7.595427904662886,
"learning_rate": 1.8661796446697557e-05,
"loss": 0.2536,
"step": 7740
},
{
"epoch": 0.63,
"grad_norm": 5.462276616537402,
"learning_rate": 1.8652889533051473e-05,
"loss": 0.2674,
"step": 7760
},
{
"epoch": 0.63,
"grad_norm": 5.302383844019715,
"learning_rate": 1.864395521640252e-05,
"loss": 0.2856,
"step": 7780
},
{
"epoch": 0.63,
"grad_norm": 4.703940083284321,
"learning_rate": 1.8634993525048227e-05,
"loss": 0.2609,
"step": 7800
},
{
"epoch": 0.63,
"grad_norm": 6.585961827134786,
"learning_rate": 1.862600448737283e-05,
"loss": 0.265,
"step": 7820
},
{
"epoch": 0.63,
"grad_norm": 7.27689896277283,
"learning_rate": 1.861698813184717e-05,
"loss": 0.3018,
"step": 7840
},
{
"epoch": 0.63,
"grad_norm": 6.231232809733686,
"learning_rate": 1.860794448702863e-05,
"loss": 0.2268,
"step": 7860
},
{
"epoch": 0.64,
"grad_norm": 7.794911353272152,
"learning_rate": 1.8598873581561e-05,
"loss": 0.2632,
"step": 7880
},
{
"epoch": 0.64,
"grad_norm": 6.977335614708055,
"learning_rate": 1.8589775444174436e-05,
"loss": 0.3097,
"step": 7900
},
{
"epoch": 0.64,
"grad_norm": 7.607942857642037,
"learning_rate": 1.858065010368533e-05,
"loss": 0.2658,
"step": 7920
},
{
"epoch": 0.64,
"grad_norm": 6.109669397778123,
"learning_rate": 1.857149758899624e-05,
"loss": 0.2613,
"step": 7940
},
{
"epoch": 0.64,
"grad_norm": 6.142102090556645,
"learning_rate": 1.8562317929095796e-05,
"loss": 0.2769,
"step": 7960
},
{
"epoch": 0.64,
"grad_norm": 3.98370343700879,
"learning_rate": 1.8553111153058593e-05,
"loss": 0.2642,
"step": 7980
},
{
"epoch": 0.65,
"grad_norm": 6.375900504146025,
"learning_rate": 1.8543877290045122e-05,
"loss": 0.2646,
"step": 8000
},
{
"epoch": 0.65,
"grad_norm": 7.277577534154136,
"learning_rate": 1.853461636930166e-05,
"loss": 0.2806,
"step": 8020
},
{
"epoch": 0.65,
"grad_norm": 6.81435963858201,
"learning_rate": 1.852532842016019e-05,
"loss": 0.2536,
"step": 8040
},
{
"epoch": 0.65,
"grad_norm": 5.854006003712663,
"learning_rate": 1.851601347203829e-05,
"loss": 0.2447,
"step": 8060
},
{
"epoch": 0.65,
"grad_norm": 7.787886275359923,
"learning_rate": 1.8506671554439064e-05,
"loss": 0.2663,
"step": 8080
},
{
"epoch": 0.65,
"grad_norm": 9.21089898409568,
"learning_rate": 1.849730269695103e-05,
"loss": 0.2601,
"step": 8100
},
{
"epoch": 0.66,
"grad_norm": 8.590889527489873,
"learning_rate": 1.8487906929248028e-05,
"loss": 0.2531,
"step": 8120
},
{
"epoch": 0.66,
"grad_norm": 5.551684548356732,
"learning_rate": 1.8478484281089143e-05,
"loss": 0.2605,
"step": 8140
},
{
"epoch": 0.66,
"grad_norm": 4.227848217032472,
"learning_rate": 1.8469034782318585e-05,
"loss": 0.2728,
"step": 8160
},
{
"epoch": 0.66,
"grad_norm": 6.015758166139706,
"learning_rate": 1.8459558462865613e-05,
"loss": 0.2883,
"step": 8180
},
{
"epoch": 0.66,
"grad_norm": 6.568658173678755,
"learning_rate": 1.845005535274444e-05,
"loss": 0.2454,
"step": 8200
},
{
"epoch": 0.66,
"grad_norm": 5.985752674217696,
"learning_rate": 1.844052548205412e-05,
"loss": 0.2442,
"step": 8220
},
{
"epoch": 0.67,
"grad_norm": 6.3219678524060425,
"learning_rate": 1.843096888097848e-05,
"loss": 0.2912,
"step": 8240
},
{
"epoch": 0.67,
"grad_norm": 4.096257730243316,
"learning_rate": 1.8421385579785997e-05,
"loss": 0.2636,
"step": 8260
},
{
"epoch": 0.67,
"grad_norm": 6.396648972118899,
"learning_rate": 1.8411775608829722e-05,
"loss": 0.2324,
"step": 8280
},
{
"epoch": 0.67,
"grad_norm": 4.782379216505,
"learning_rate": 1.8402138998547174e-05,
"loss": 0.2675,
"step": 8300
},
{
"epoch": 0.67,
"grad_norm": 8.676707198167653,
"learning_rate": 1.839247577946025e-05,
"loss": 0.2843,
"step": 8320
},
{
"epoch": 0.67,
"grad_norm": 5.39138478992206,
"learning_rate": 1.8382785982175118e-05,
"loss": 0.2742,
"step": 8340
},
{
"epoch": 0.67,
"grad_norm": 5.818443622984385,
"learning_rate": 1.8373069637382136e-05,
"loss": 0.26,
"step": 8360
},
{
"epoch": 0.68,
"grad_norm": 8.95366226368456,
"learning_rate": 1.8363326775855737e-05,
"loss": 0.2687,
"step": 8380
},
{
"epoch": 0.68,
"grad_norm": 7.96756080281063,
"learning_rate": 1.8353557428454346e-05,
"loss": 0.2425,
"step": 8400
},
{
"epoch": 0.68,
"grad_norm": 6.577104865413394,
"learning_rate": 1.8343761626120272e-05,
"loss": 0.2688,
"step": 8420
},
{
"epoch": 0.68,
"grad_norm": 6.6269283727065,
"learning_rate": 1.8333939399879617e-05,
"loss": 0.2808,
"step": 8440
},
{
"epoch": 0.68,
"grad_norm": 7.016348140974161,
"learning_rate": 1.8324090780842173e-05,
"loss": 0.2511,
"step": 8460
},
{
"epoch": 0.68,
"grad_norm": 7.5454363034081116,
"learning_rate": 1.831421580020133e-05,
"loss": 0.252,
"step": 8480
},
{
"epoch": 0.69,
"grad_norm": 5.837760589468463,
"learning_rate": 1.830431448923396e-05,
"loss": 0.2728,
"step": 8500
},
{
"epoch": 0.69,
"grad_norm": 6.154380243306325,
"learning_rate": 1.8294386879300353e-05,
"loss": 0.2867,
"step": 8520
},
{
"epoch": 0.69,
"grad_norm": 7.71122937485844,
"learning_rate": 1.8284433001844073e-05,
"loss": 0.2302,
"step": 8540
},
{
"epoch": 0.69,
"grad_norm": 6.86335128201322,
"learning_rate": 1.8274452888391894e-05,
"loss": 0.2586,
"step": 8560
},
{
"epoch": 0.69,
"grad_norm": 5.661853354206643,
"learning_rate": 1.8264446570553682e-05,
"loss": 0.2505,
"step": 8580
},
{
"epoch": 0.69,
"grad_norm": 5.982364804963667,
"learning_rate": 1.82544140800223e-05,
"loss": 0.2673,
"step": 8600
},
{
"epoch": 0.7,
"grad_norm": 4.739028708176796,
"learning_rate": 1.824435544857351e-05,
"loss": 0.2678,
"step": 8620
},
{
"epoch": 0.7,
"grad_norm": 4.91420533377473,
"learning_rate": 1.823427070806587e-05,
"loss": 0.2559,
"step": 8640
},
{
"epoch": 0.7,
"grad_norm": 5.618249360419533,
"learning_rate": 1.8224159890440623e-05,
"loss": 0.2493,
"step": 8660
},
{
"epoch": 0.7,
"grad_norm": 5.896677808188606,
"learning_rate": 1.821402302772162e-05,
"loss": 0.2585,
"step": 8680
},
{
"epoch": 0.7,
"grad_norm": 6.073985124124518,
"learning_rate": 1.82038601520152e-05,
"loss": 0.2452,
"step": 8700
},
{
"epoch": 0.7,
"grad_norm": 7.1459209410818,
"learning_rate": 1.819367129551008e-05,
"loss": 0.2592,
"step": 8720
},
{
"epoch": 0.71,
"grad_norm": 6.390094315335785,
"learning_rate": 1.8183456490477287e-05,
"loss": 0.2461,
"step": 8740
},
{
"epoch": 0.71,
"grad_norm": 5.294426005863845,
"learning_rate": 1.8173215769270015e-05,
"loss": 0.2685,
"step": 8760
},
{
"epoch": 0.71,
"grad_norm": 5.276924483715485,
"learning_rate": 1.8162949164323554e-05,
"loss": 0.2615,
"step": 8780
},
{
"epoch": 0.71,
"grad_norm": 7.331765382932756,
"learning_rate": 1.8152656708155173e-05,
"loss": 0.2828,
"step": 8800
},
{
"epoch": 0.71,
"grad_norm": 5.361402122667844,
"learning_rate": 1.8142338433364012e-05,
"loss": 0.2849,
"step": 8820
},
{
"epoch": 0.71,
"grad_norm": 6.712375473487036,
"learning_rate": 1.8131994372630995e-05,
"loss": 0.2716,
"step": 8840
},
{
"epoch": 0.72,
"grad_norm": 8.103353922148388,
"learning_rate": 1.812162455871872e-05,
"loss": 0.2703,
"step": 8860
},
{
"epoch": 0.72,
"grad_norm": 4.585974100152074,
"learning_rate": 1.8111229024471334e-05,
"loss": 0.2386,
"step": 8880
},
{
"epoch": 0.72,
"grad_norm": 6.8332489132512375,
"learning_rate": 1.8100807802814467e-05,
"loss": 0.2935,
"step": 8900
},
{
"epoch": 0.72,
"grad_norm": 5.556964992180211,
"learning_rate": 1.80903609267551e-05,
"loss": 0.2404,
"step": 8920
},
{
"epoch": 0.72,
"grad_norm": 6.524527124099894,
"learning_rate": 1.8079888429381472e-05,
"loss": 0.2477,
"step": 8940
},
{
"epoch": 0.72,
"grad_norm": 6.394125877212817,
"learning_rate": 1.8069390343862972e-05,
"loss": 0.2585,
"step": 8960
},
{
"epoch": 0.72,
"grad_norm": 7.212304875264878,
"learning_rate": 1.805886670345003e-05,
"loss": 0.2514,
"step": 8980
},
{
"epoch": 0.73,
"grad_norm": 5.915336602662839,
"learning_rate": 1.8048317541474015e-05,
"loss": 0.2554,
"step": 9000
},
{
"epoch": 0.73,
"grad_norm": 6.204874325324116,
"learning_rate": 1.803774289134714e-05,
"loss": 0.2663,
"step": 9020
},
{
"epoch": 0.73,
"grad_norm": 4.9458264028130525,
"learning_rate": 1.8027142786562334e-05,
"loss": 0.2374,
"step": 9040
},
{
"epoch": 0.73,
"grad_norm": 5.66437734846908,
"learning_rate": 1.8016517260693152e-05,
"loss": 0.2173,
"step": 9060
},
{
"epoch": 0.73,
"grad_norm": 8.8145498502476,
"learning_rate": 1.800586634739367e-05,
"loss": 0.2672,
"step": 9080
},
{
"epoch": 0.73,
"grad_norm": 5.225621616310874,
"learning_rate": 1.799519008039837e-05,
"loss": 0.263,
"step": 9100
},
{
"epoch": 0.74,
"grad_norm": 6.749141497235558,
"learning_rate": 1.7984488493522033e-05,
"loss": 0.294,
"step": 9120
},
{
"epoch": 0.74,
"grad_norm": 6.5925500148457115,
"learning_rate": 1.7973761620659645e-05,
"loss": 0.2549,
"step": 9140
},
{
"epoch": 0.74,
"grad_norm": 3.6612011894705097,
"learning_rate": 1.7963009495786262e-05,
"loss": 0.274,
"step": 9160
},
{
"epoch": 0.74,
"grad_norm": 7.730637018917763,
"learning_rate": 1.795223215295694e-05,
"loss": 0.2476,
"step": 9180
},
{
"epoch": 0.74,
"grad_norm": 5.253387992852078,
"learning_rate": 1.7941429626306597e-05,
"loss": 0.2557,
"step": 9200
},
{
"epoch": 0.74,
"grad_norm": 6.185451592355014,
"learning_rate": 1.7930601950049918e-05,
"loss": 0.2414,
"step": 9220
},
{
"epoch": 0.75,
"grad_norm": 8.613330410148825,
"learning_rate": 1.7919749158481238e-05,
"loss": 0.252,
"step": 9240
},
{
"epoch": 0.75,
"grad_norm": 4.082779957130279,
"learning_rate": 1.7908871285974452e-05,
"loss": 0.246,
"step": 9260
},
{
"epoch": 0.75,
"grad_norm": 5.080789002249157,
"learning_rate": 1.789796836698288e-05,
"loss": 0.2241,
"step": 9280
},
{
"epoch": 0.75,
"grad_norm": 5.616004872409631,
"learning_rate": 1.788704043603918e-05,
"loss": 0.2635,
"step": 9300
},
{
"epoch": 0.75,
"grad_norm": 4.6896605535132005,
"learning_rate": 1.787608752775523e-05,
"loss": 0.2496,
"step": 9320
},
{
"epoch": 0.75,
"grad_norm": 6.020003669712304,
"learning_rate": 1.786510967682201e-05,
"loss": 0.2742,
"step": 9340
},
{
"epoch": 0.76,
"grad_norm": 4.869330214670387,
"learning_rate": 1.7854106918009516e-05,
"loss": 0.2554,
"step": 9360
},
{
"epoch": 0.76,
"grad_norm": 5.463125770044224,
"learning_rate": 1.7843079286166613e-05,
"loss": 0.256,
"step": 9380
},
{
"epoch": 0.76,
"grad_norm": 12.859151326084799,
"learning_rate": 1.7832026816220964e-05,
"loss": 0.3044,
"step": 9400
},
{
"epoch": 0.76,
"grad_norm": 7.462079888408213,
"learning_rate": 1.7820949543178893e-05,
"loss": 0.2603,
"step": 9420
},
{
"epoch": 0.76,
"grad_norm": 6.251675190537996,
"learning_rate": 1.7809847502125287e-05,
"loss": 0.2524,
"step": 9440
},
{
"epoch": 0.76,
"grad_norm": 20.20686096910179,
"learning_rate": 1.779872072822348e-05,
"loss": 0.2727,
"step": 9460
},
{
"epoch": 0.77,
"grad_norm": 11.117280832355938,
"learning_rate": 1.7787569256715128e-05,
"loss": 0.2751,
"step": 9480
},
{
"epoch": 0.77,
"grad_norm": 6.174365967852932,
"learning_rate": 1.7776393122920136e-05,
"loss": 0.2465,
"step": 9500
},
{
"epoch": 0.77,
"grad_norm": 6.5845686642808205,
"learning_rate": 1.7765192362236505e-05,
"loss": 0.2637,
"step": 9520
},
{
"epoch": 0.77,
"grad_norm": 9.227894944405277,
"learning_rate": 1.775396701014024e-05,
"loss": 0.2594,
"step": 9540
},
{
"epoch": 0.77,
"grad_norm": 6.0294211980015255,
"learning_rate": 1.7742717102185233e-05,
"loss": 0.2506,
"step": 9560
},
{
"epoch": 0.77,
"grad_norm": 6.611585459356701,
"learning_rate": 1.7731442674003153e-05,
"loss": 0.256,
"step": 9580
},
{
"epoch": 0.77,
"grad_norm": 6.474013099428535,
"learning_rate": 1.772014376130333e-05,
"loss": 0.2509,
"step": 9600
},
{
"epoch": 0.78,
"grad_norm": 4.050917561517386,
"learning_rate": 1.7708820399872644e-05,
"loss": 0.2597,
"step": 9620
},
{
"epoch": 0.78,
"grad_norm": 7.523512541811629,
"learning_rate": 1.7697472625575415e-05,
"loss": 0.2617,
"step": 9640
},
{
"epoch": 0.78,
"grad_norm": 4.674855993980255,
"learning_rate": 1.768610047435328e-05,
"loss": 0.2148,
"step": 9660
},
{
"epoch": 0.78,
"grad_norm": 3.581193699152847,
"learning_rate": 1.7674703982225084e-05,
"loss": 0.2485,
"step": 9680
},
{
"epoch": 0.78,
"grad_norm": 5.995347444394187,
"learning_rate": 1.7663283185286778e-05,
"loss": 0.2504,
"step": 9700
},
{
"epoch": 0.78,
"grad_norm": 6.106039165812286,
"learning_rate": 1.7651838119711278e-05,
"loss": 0.2591,
"step": 9720
},
{
"epoch": 0.79,
"grad_norm": 5.544368037680747,
"learning_rate": 1.7640368821748374e-05,
"loss": 0.2589,
"step": 9740
},
{
"epoch": 0.79,
"grad_norm": 11.908781488384356,
"learning_rate": 1.7628875327724604e-05,
"loss": 0.24,
"step": 9760
},
{
"epoch": 0.79,
"grad_norm": 5.2162186199664005,
"learning_rate": 1.761735767404314e-05,
"loss": 0.279,
"step": 9780
},
{
"epoch": 0.79,
"grad_norm": 8.332009731717408,
"learning_rate": 1.760581589718369e-05,
"loss": 0.2523,
"step": 9800
},
{
"epoch": 0.79,
"grad_norm": 6.811834460305066,
"learning_rate": 1.759425003370234e-05,
"loss": 0.2422,
"step": 9820
},
{
"epoch": 0.79,
"grad_norm": 10.001650864708848,
"learning_rate": 1.758266012023149e-05,
"loss": 0.2415,
"step": 9840
},
{
"epoch": 0.8,
"grad_norm": 14.181135321229519,
"learning_rate": 1.7571046193479697e-05,
"loss": 0.2439,
"step": 9860
},
{
"epoch": 0.8,
"grad_norm": 5.304371617930666,
"learning_rate": 1.7559408290231582e-05,
"loss": 0.2883,
"step": 9880
},
{
"epoch": 0.8,
"grad_norm": 10.159891549680514,
"learning_rate": 1.754774644734771e-05,
"loss": 0.2402,
"step": 9900
},
{
"epoch": 0.8,
"grad_norm": 21.596871665189294,
"learning_rate": 1.753606070176446e-05,
"loss": 0.2646,
"step": 9920
},
{
"epoch": 0.8,
"grad_norm": 3.6266946448855064,
"learning_rate": 1.752435109049392e-05,
"loss": 0.2463,
"step": 9940
},
{
"epoch": 0.8,
"grad_norm": 7.461139967802549,
"learning_rate": 1.7512617650623776e-05,
"loss": 0.2343,
"step": 9960
},
{
"epoch": 0.81,
"grad_norm": 5.8844648373593955,
"learning_rate": 1.7500860419317183e-05,
"loss": 0.251,
"step": 9980
},
{
"epoch": 0.81,
"grad_norm": 9.038354738793856,
"learning_rate": 1.7489079433812638e-05,
"loss": 0.2494,
"step": 10000
},
{
"epoch": 0.81,
"grad_norm": 8.591404154257724,
"learning_rate": 1.7477274731423892e-05,
"loss": 0.2374,
"step": 10020
},
{
"epoch": 0.81,
"grad_norm": 5.9870710947999815,
"learning_rate": 1.7465446349539797e-05,
"loss": 0.2206,
"step": 10040
},
{
"epoch": 0.81,
"grad_norm": 6.228813578147013,
"learning_rate": 1.7453594325624224e-05,
"loss": 0.2462,
"step": 10060
},
{
"epoch": 0.81,
"grad_norm": 5.257017078287017,
"learning_rate": 1.7441718697215904e-05,
"loss": 0.2409,
"step": 10080
},
{
"epoch": 0.82,
"grad_norm": 6.952956019716318,
"learning_rate": 1.742981950192835e-05,
"loss": 0.2521,
"step": 10100
},
{
"epoch": 0.82,
"grad_norm": 5.5548892299756805,
"learning_rate": 1.7417896777449706e-05,
"loss": 0.2647,
"step": 10120
},
{
"epoch": 0.82,
"grad_norm": 5.73273030739662,
"learning_rate": 1.7405950561542636e-05,
"loss": 0.2473,
"step": 10140
},
{
"epoch": 0.82,
"grad_norm": 5.8226292447674775,
"learning_rate": 1.7393980892044222e-05,
"loss": 0.2799,
"step": 10160
},
{
"epoch": 0.82,
"grad_norm": 6.573153903103647,
"learning_rate": 1.738198780686582e-05,
"loss": 0.2391,
"step": 10180
},
{
"epoch": 0.82,
"grad_norm": 6.2081294015592094,
"learning_rate": 1.7369971343992953e-05,
"loss": 0.2441,
"step": 10200
},
{
"epoch": 0.82,
"grad_norm": 7.239395541675969,
"learning_rate": 1.735793154148519e-05,
"loss": 0.2467,
"step": 10220
},
{
"epoch": 0.83,
"grad_norm": 6.574019720880623,
"learning_rate": 1.7345868437476016e-05,
"loss": 0.2742,
"step": 10240
},
{
"epoch": 0.83,
"grad_norm": 3.932079883792344,
"learning_rate": 1.733378207017273e-05,
"loss": 0.2799,
"step": 10260
},
{
"epoch": 0.83,
"grad_norm": 7.965596611059161,
"learning_rate": 1.7321672477856297e-05,
"loss": 0.268,
"step": 10280
},
{
"epoch": 0.83,
"grad_norm": 6.637332593742831,
"learning_rate": 1.730953969888126e-05,
"loss": 0.281,
"step": 10300
},
{
"epoch": 0.83,
"grad_norm": 4.598400020154981,
"learning_rate": 1.729738377167559e-05,
"loss": 0.2688,
"step": 10320
},
{
"epoch": 0.83,
"grad_norm": 10.008276375495472,
"learning_rate": 1.728520473474057e-05,
"loss": 0.2424,
"step": 10340
},
{
"epoch": 0.84,
"grad_norm": 9.609588968019253,
"learning_rate": 1.7273002626650693e-05,
"loss": 0.2562,
"step": 10360
},
{
"epoch": 0.84,
"grad_norm": 6.246946580790647,
"learning_rate": 1.726077748605352e-05,
"loss": 0.2536,
"step": 10380
},
{
"epoch": 0.84,
"grad_norm": 5.207954250527354,
"learning_rate": 1.724852935166955e-05,
"loss": 0.2803,
"step": 10400
},
{
"epoch": 0.84,
"grad_norm": 6.83554630577102,
"learning_rate": 1.723625826229212e-05,
"loss": 0.2366,
"step": 10420
},
{
"epoch": 0.84,
"grad_norm": 5.2741649888827,
"learning_rate": 1.7223964256787275e-05,
"loss": 0.2589,
"step": 10440
},
{
"epoch": 0.84,
"grad_norm": 4.504793580943435,
"learning_rate": 1.7211647374093644e-05,
"loss": 0.2654,
"step": 10460
},
{
"epoch": 0.85,
"grad_norm": 5.074320615196733,
"learning_rate": 1.71993076532223e-05,
"loss": 0.2531,
"step": 10480
},
{
"epoch": 0.85,
"grad_norm": 7.4921309833960645,
"learning_rate": 1.7186945133256663e-05,
"loss": 0.2452,
"step": 10500
},
{
"epoch": 0.85,
"grad_norm": 4.773435701909952,
"learning_rate": 1.7174559853352366e-05,
"loss": 0.2786,
"step": 10520
},
{
"epoch": 0.85,
"grad_norm": 5.190944401366304,
"learning_rate": 1.7162151852737114e-05,
"loss": 0.2082,
"step": 10540
},
{
"epoch": 0.85,
"grad_norm": 6.8860794956428215,
"learning_rate": 1.7149721170710597e-05,
"loss": 0.2593,
"step": 10560
},
{
"epoch": 0.85,
"grad_norm": 5.315969613200098,
"learning_rate": 1.7137267846644324e-05,
"loss": 0.2451,
"step": 10580
},
{
"epoch": 0.86,
"grad_norm": 8.924983723943493,
"learning_rate": 1.712479191998153e-05,
"loss": 0.2487,
"step": 10600
},
{
"epoch": 0.86,
"grad_norm": 4.785603454868163,
"learning_rate": 1.711229343023703e-05,
"loss": 0.275,
"step": 10620
},
{
"epoch": 0.86,
"grad_norm": 4.5511584473505895,
"learning_rate": 1.709977241699711e-05,
"loss": 0.2438,
"step": 10640
},
{
"epoch": 0.86,
"grad_norm": 6.601440573023448,
"learning_rate": 1.7087228919919395e-05,
"loss": 0.2682,
"step": 10660
},
{
"epoch": 0.86,
"grad_norm": 8.06521205975687,
"learning_rate": 1.7074662978732713e-05,
"loss": 0.2672,
"step": 10680
},
{
"epoch": 0.86,
"grad_norm": 5.877886448612562,
"learning_rate": 1.7062074633236992e-05,
"loss": 0.2415,
"step": 10700
},
{
"epoch": 0.87,
"grad_norm": 6.00267509589556,
"learning_rate": 1.704946392330311e-05,
"loss": 0.245,
"step": 10720
},
{
"epoch": 0.87,
"grad_norm": 18.727472632503616,
"learning_rate": 1.703683088887278e-05,
"loss": 0.2527,
"step": 10740
},
{
"epoch": 0.87,
"grad_norm": 8.42578939933542,
"learning_rate": 1.7024175569958435e-05,
"loss": 0.2447,
"step": 10760
},
{
"epoch": 0.87,
"grad_norm": 15.871158165018187,
"learning_rate": 1.7011498006643075e-05,
"loss": 0.2611,
"step": 10780
},
{
"epoch": 0.87,
"grad_norm": 4.623538224443551,
"learning_rate": 1.6998798239080167e-05,
"loss": 0.2521,
"step": 10800
},
{
"epoch": 0.87,
"grad_norm": 6.908983060916792,
"learning_rate": 1.698607630749349e-05,
"loss": 0.2298,
"step": 10820
},
{
"epoch": 0.88,
"grad_norm": 6.502465294111384,
"learning_rate": 1.6973332252177036e-05,
"loss": 0.2498,
"step": 10840
},
{
"epoch": 0.88,
"grad_norm": 4.978479228853818,
"learning_rate": 1.6960566113494865e-05,
"loss": 0.252,
"step": 10860
},
{
"epoch": 0.88,
"grad_norm": 5.650381173298351,
"learning_rate": 1.694777793188098e-05,
"loss": 0.2288,
"step": 10880
},
{
"epoch": 0.88,
"grad_norm": 7.073746360539243,
"learning_rate": 1.6934967747839202e-05,
"loss": 0.2519,
"step": 10900
},
{
"epoch": 0.88,
"grad_norm": 5.927901369661737,
"learning_rate": 1.6922135601943037e-05,
"loss": 0.265,
"step": 10920
},
{
"epoch": 0.88,
"grad_norm": 5.53567758715019,
"learning_rate": 1.690928153483555e-05,
"loss": 0.25,
"step": 10940
},
{
"epoch": 0.88,
"grad_norm": 7.570944618942586,
"learning_rate": 1.6896405587229247e-05,
"loss": 0.2549,
"step": 10960
},
{
"epoch": 0.89,
"grad_norm": 7.379565103013804,
"learning_rate": 1.6883507799905922e-05,
"loss": 0.2363,
"step": 10980
},
{
"epoch": 0.89,
"grad_norm": 9.023229502472875,
"learning_rate": 1.6870588213716555e-05,
"loss": 0.2832,
"step": 11000
},
{
"epoch": 0.89,
"grad_norm": 5.6792260655491855,
"learning_rate": 1.6857646869581153e-05,
"loss": 0.228,
"step": 11020
},
{
"epoch": 0.89,
"grad_norm": 7.456793627942026,
"learning_rate": 1.6844683808488647e-05,
"loss": 0.2494,
"step": 11040
},
{
"epoch": 0.89,
"grad_norm": 4.8011477449229885,
"learning_rate": 1.6831699071496758e-05,
"loss": 0.2634,
"step": 11060
},
{
"epoch": 0.89,
"grad_norm": 6.58057290965885,
"learning_rate": 1.681869269973184e-05,
"loss": 0.2577,
"step": 11080
},
{
"epoch": 0.9,
"grad_norm": 5.68008811828603,
"learning_rate": 1.68056647343888e-05,
"loss": 0.2297,
"step": 11100
},
{
"epoch": 0.9,
"grad_norm": 6.528010244716758,
"learning_rate": 1.6792615216730907e-05,
"loss": 0.2196,
"step": 11120
},
{
"epoch": 0.9,
"grad_norm": 5.853566456861371,
"learning_rate": 1.6779544188089715e-05,
"loss": 0.2629,
"step": 11140
},
{
"epoch": 0.9,
"grad_norm": 10.986926893405414,
"learning_rate": 1.67664516898649e-05,
"loss": 0.2302,
"step": 11160
},
{
"epoch": 0.9,
"grad_norm": 7.730824034913035,
"learning_rate": 1.6753337763524137e-05,
"loss": 0.2336,
"step": 11180
},
{
"epoch": 0.9,
"grad_norm": 7.922173067463235,
"learning_rate": 1.6740202450602976e-05,
"loss": 0.2686,
"step": 11200
},
{
"epoch": 0.91,
"grad_norm": 5.406865255814246,
"learning_rate": 1.67270457927047e-05,
"loss": 0.226,
"step": 11220
},
{
"epoch": 0.91,
"grad_norm": 6.843481049848729,
"learning_rate": 1.6713867831500195e-05,
"loss": 0.2586,
"step": 11240
},
{
"epoch": 0.91,
"grad_norm": 5.49549924323287,
"learning_rate": 1.670066860872783e-05,
"loss": 0.2627,
"step": 11260
},
{
"epoch": 0.91,
"grad_norm": 6.183808429627808,
"learning_rate": 1.6687448166193306e-05,
"loss": 0.2749,
"step": 11280
},
{
"epoch": 0.91,
"grad_norm": 4.378810204329709,
"learning_rate": 1.667420654576954e-05,
"loss": 0.2558,
"step": 11300
},
{
"epoch": 0.91,
"grad_norm": 6.028002244995752,
"learning_rate": 1.666094378939652e-05,
"loss": 0.2554,
"step": 11320
},
{
"epoch": 0.92,
"grad_norm": 7.776788987779546,
"learning_rate": 1.664765993908118e-05,
"loss": 0.2326,
"step": 11340
},
{
"epoch": 0.92,
"grad_norm": 7.503277380435426,
"learning_rate": 1.663435503689726e-05,
"loss": 0.2707,
"step": 11360
},
{
"epoch": 0.92,
"grad_norm": 6.303861845235693,
"learning_rate": 1.6621029124985195e-05,
"loss": 0.2435,
"step": 11380
},
{
"epoch": 0.92,
"grad_norm": 7.213728574312154,
"learning_rate": 1.6607682245551935e-05,
"loss": 0.2514,
"step": 11400
},
{
"epoch": 0.92,
"grad_norm": 5.2552293437415525,
"learning_rate": 1.6594314440870864e-05,
"loss": 0.2397,
"step": 11420
},
{
"epoch": 0.92,
"grad_norm": 6.538249814157013,
"learning_rate": 1.6580925753281634e-05,
"loss": 0.2655,
"step": 11440
},
{
"epoch": 0.93,
"grad_norm": 5.2378821622768905,
"learning_rate": 1.6567516225190035e-05,
"loss": 0.2607,
"step": 11460
},
{
"epoch": 0.93,
"grad_norm": 5.674850314010563,
"learning_rate": 1.655408589906787e-05,
"loss": 0.2723,
"step": 11480
},
{
"epoch": 0.93,
"grad_norm": 7.192949169932349,
"learning_rate": 1.654063481745281e-05,
"loss": 0.2561,
"step": 11500
},
{
"epoch": 0.93,
"grad_norm": 13.135993930717675,
"learning_rate": 1.652716302294828e-05,
"loss": 0.2382,
"step": 11520
},
{
"epoch": 0.93,
"grad_norm": 4.887607996691356,
"learning_rate": 1.651367055822329e-05,
"loss": 0.2863,
"step": 11540
},
{
"epoch": 0.93,
"grad_norm": 7.367579978609729,
"learning_rate": 1.6500157466012324e-05,
"loss": 0.2379,
"step": 11560
},
{
"epoch": 0.93,
"grad_norm": 8.199270857981157,
"learning_rate": 1.6486623789115205e-05,
"loss": 0.2432,
"step": 11580
},
{
"epoch": 0.94,
"grad_norm": 6.243091274334211,
"learning_rate": 1.6473069570396942e-05,
"loss": 0.2635,
"step": 11600
},
{
"epoch": 0.94,
"grad_norm": 5.6352137765892545,
"learning_rate": 1.6459494852787622e-05,
"loss": 0.2292,
"step": 11620
},
{
"epoch": 0.94,
"grad_norm": 5.2104929401235305,
"learning_rate": 1.6445899679282248e-05,
"loss": 0.2545,
"step": 11640
},
{
"epoch": 0.94,
"grad_norm": 5.635847694521193,
"learning_rate": 1.6432284092940607e-05,
"loss": 0.247,
"step": 11660
},
{
"epoch": 0.94,
"grad_norm": 5.853851889115171,
"learning_rate": 1.6418648136887152e-05,
"loss": 0.2323,
"step": 11680
},
{
"epoch": 0.94,
"grad_norm": 4.98208977143132,
"learning_rate": 1.6404991854310846e-05,
"loss": 0.238,
"step": 11700
},
{
"epoch": 0.95,
"grad_norm": 5.560280174770714,
"learning_rate": 1.6391315288465027e-05,
"loss": 0.2589,
"step": 11720
},
{
"epoch": 0.95,
"grad_norm": 11.332988584174231,
"learning_rate": 1.637761848266729e-05,
"loss": 0.2437,
"step": 11740
},
{
"epoch": 0.95,
"grad_norm": 13.079688339953384,
"learning_rate": 1.6363901480299323e-05,
"loss": 0.2489,
"step": 11760
},
{
"epoch": 0.95,
"grad_norm": 6.852537601204953,
"learning_rate": 1.6350164324806787e-05,
"loss": 0.218,
"step": 11780
},
{
"epoch": 0.95,
"grad_norm": 6.384240727219325,
"learning_rate": 1.633640705969917e-05,
"loss": 0.2419,
"step": 11800
},
{
"epoch": 0.95,
"grad_norm": 4.348764283501352,
"learning_rate": 1.6322629728549665e-05,
"loss": 0.2037,
"step": 11820
},
{
"epoch": 0.96,
"grad_norm": 5.096264739138052,
"learning_rate": 1.6308832374994997e-05,
"loss": 0.2502,
"step": 11840
},
{
"epoch": 0.96,
"grad_norm": 4.471177088927129,
"learning_rate": 1.6295015042735336e-05,
"loss": 0.2435,
"step": 11860
},
{
"epoch": 0.96,
"grad_norm": 7.886308089698534,
"learning_rate": 1.6281177775534106e-05,
"loss": 0.2367,
"step": 11880
},
{
"epoch": 0.96,
"grad_norm": 5.0872043608074335,
"learning_rate": 1.6267320617217886e-05,
"loss": 0.2618,
"step": 11900
},
{
"epoch": 0.96,
"grad_norm": 7.332403239597943,
"learning_rate": 1.6253443611676247e-05,
"loss": 0.2377,
"step": 11920
},
{
"epoch": 0.96,
"grad_norm": 5.2156408493688,
"learning_rate": 1.6239546802861628e-05,
"loss": 0.2588,
"step": 11940
},
{
"epoch": 0.97,
"grad_norm": 14.389605988283588,
"learning_rate": 1.6225630234789186e-05,
"loss": 0.2359,
"step": 11960
},
{
"epoch": 0.97,
"grad_norm": 6.61108607154756,
"learning_rate": 1.621169395153666e-05,
"loss": 0.2454,
"step": 11980
},
{
"epoch": 0.97,
"grad_norm": 5.92623925749379,
"learning_rate": 1.6197737997244242e-05,
"loss": 0.2504,
"step": 12000
},
{
"epoch": 0.97,
"grad_norm": 6.729876438497323,
"learning_rate": 1.6183762416114417e-05,
"loss": 0.231,
"step": 12020
},
{
"epoch": 0.97,
"grad_norm": 4.91119912664639,
"learning_rate": 1.6169767252411843e-05,
"loss": 0.2732,
"step": 12040
},
{
"epoch": 0.97,
"grad_norm": 7.372474108547359,
"learning_rate": 1.615575255046319e-05,
"loss": 0.2396,
"step": 12060
},
{
"epoch": 0.98,
"grad_norm": 4.844310112839635,
"learning_rate": 1.6141718354657023e-05,
"loss": 0.2682,
"step": 12080
},
{
"epoch": 0.98,
"grad_norm": 7.827541428550464,
"learning_rate": 1.6127664709443642e-05,
"loss": 0.2351,
"step": 12100
},
{
"epoch": 0.98,
"grad_norm": 6.394194783450918,
"learning_rate": 1.6113591659334952e-05,
"loss": 0.277,
"step": 12120
},
{
"epoch": 0.98,
"grad_norm": 6.728544539125102,
"learning_rate": 1.609949924890432e-05,
"loss": 0.2517,
"step": 12140
},
{
"epoch": 0.98,
"grad_norm": 4.095514979882195,
"learning_rate": 1.6085387522786432e-05,
"loss": 0.2317,
"step": 12160
},
{
"epoch": 0.98,
"grad_norm": 6.899190893971197,
"learning_rate": 1.6071256525677144e-05,
"loss": 0.239,
"step": 12180
},
{
"epoch": 0.98,
"grad_norm": 5.002813882583922,
"learning_rate": 1.6057106302333366e-05,
"loss": 0.2411,
"step": 12200
},
{
"epoch": 0.99,
"grad_norm": 6.7562128367712,
"learning_rate": 1.6042936897572883e-05,
"loss": 0.2347,
"step": 12220
},
{
"epoch": 0.99,
"grad_norm": 9.896004658604653,
"learning_rate": 1.6028748356274247e-05,
"loss": 0.2526,
"step": 12240
},
{
"epoch": 0.99,
"grad_norm": 7.972800268940516,
"learning_rate": 1.6014540723376623e-05,
"loss": 0.2505,
"step": 12260
},
{
"epoch": 0.99,
"grad_norm": 5.170343546862058,
"learning_rate": 1.600031404387963e-05,
"loss": 0.2478,
"step": 12280
},
{
"epoch": 0.99,
"grad_norm": 6.356344714814083,
"learning_rate": 1.5986068362843224e-05,
"loss": 0.2767,
"step": 12300
},
{
"epoch": 0.99,
"grad_norm": 6.20794198597022,
"learning_rate": 1.5971803725387544e-05,
"loss": 0.2533,
"step": 12320
},
{
"epoch": 1.0,
"grad_norm": 7.368279449995274,
"learning_rate": 1.5957520176692766e-05,
"loss": 0.2706,
"step": 12340
},
{
"epoch": 1.0,
"grad_norm": 9.218421438795374,
"learning_rate": 1.594321776199896e-05,
"loss": 0.2447,
"step": 12360
},
{
"epoch": 1.0,
"grad_norm": 5.4653346268657845,
"learning_rate": 1.592889652660596e-05,
"loss": 0.2339,
"step": 12380
},
{
"epoch": 1.0,
"grad_norm": 6.741041667370887,
"learning_rate": 1.5914556515873197e-05,
"loss": 0.1749,
"step": 12400
},
{
"epoch": 1.0,
"grad_norm": 4.207049838195936,
"learning_rate": 1.590019777521959e-05,
"loss": 0.1849,
"step": 12420
},
{
"epoch": 1.0,
"grad_norm": 7.1496607666636285,
"learning_rate": 1.588582035012336e-05,
"loss": 0.1743,
"step": 12440
},
{
"epoch": 1.01,
"grad_norm": 7.5265979882421865,
"learning_rate": 1.587142428612191e-05,
"loss": 0.1868,
"step": 12460
},
{
"epoch": 1.01,
"grad_norm": 5.651063343012383,
"learning_rate": 1.5857009628811692e-05,
"loss": 0.1983,
"step": 12480
},
{
"epoch": 1.01,
"grad_norm": 9.202976607727676,
"learning_rate": 1.5842576423848034e-05,
"loss": 0.1917,
"step": 12500
},
{
"epoch": 1.01,
"grad_norm": 5.832342590483985,
"learning_rate": 1.582812471694501e-05,
"loss": 0.2189,
"step": 12520
},
{
"epoch": 1.01,
"grad_norm": 6.20925991986496,
"learning_rate": 1.5813654553875307e-05,
"loss": 0.1941,
"step": 12540
},
{
"epoch": 1.01,
"grad_norm": 6.9734995441552865,
"learning_rate": 1.579916598047006e-05,
"loss": 0.1722,
"step": 12560
},
{
"epoch": 1.02,
"grad_norm": 5.261181866981142,
"learning_rate": 1.578465904261871e-05,
"loss": 0.1841,
"step": 12580
},
{
"epoch": 1.02,
"grad_norm": 6.347552583288099,
"learning_rate": 1.5770133786268867e-05,
"loss": 0.2178,
"step": 12600
},
{
"epoch": 1.02,
"grad_norm": 5.354096329322261,
"learning_rate": 1.5755590257426172e-05,
"loss": 0.2037,
"step": 12620
},
{
"epoch": 1.02,
"grad_norm": 6.433760955249804,
"learning_rate": 1.5741028502154122e-05,
"loss": 0.1918,
"step": 12640
},
{
"epoch": 1.02,
"grad_norm": 10.724559043942634,
"learning_rate": 1.572644856657396e-05,
"loss": 0.1943,
"step": 12660
},
{
"epoch": 1.02,
"grad_norm": 7.14880036647321,
"learning_rate": 1.571185049686449e-05,
"loss": 0.1971,
"step": 12680
},
{
"epoch": 1.03,
"grad_norm": 5.58282494958642,
"learning_rate": 1.5697234339261973e-05,
"loss": 0.2066,
"step": 12700
},
{
"epoch": 1.03,
"grad_norm": 5.049152250234184,
"learning_rate": 1.5682600140059945e-05,
"loss": 0.2155,
"step": 12720
},
{
"epoch": 1.03,
"grad_norm": 5.577269074409907,
"learning_rate": 1.5667947945609098e-05,
"loss": 0.2307,
"step": 12740
},
{
"epoch": 1.03,
"grad_norm": 7.3636602086666905,
"learning_rate": 1.5653277802317107e-05,
"loss": 0.1904,
"step": 12760
},
{
"epoch": 1.03,
"grad_norm": 5.824392969812123,
"learning_rate": 1.5638589756648507e-05,
"loss": 0.1796,
"step": 12780
},
{
"epoch": 1.03,
"grad_norm": 6.363241683808851,
"learning_rate": 1.562388385512452e-05,
"loss": 0.1792,
"step": 12800
},
{
"epoch": 1.03,
"grad_norm": 8.101137773642606,
"learning_rate": 1.560916014432294e-05,
"loss": 0.1934,
"step": 12820
},
{
"epoch": 1.04,
"grad_norm": 4.945106731069112,
"learning_rate": 1.559441867087796e-05,
"loss": 0.2209,
"step": 12840
},
{
"epoch": 1.04,
"grad_norm": 6.2605136180443495,
"learning_rate": 1.5579659481480026e-05,
"loss": 0.1781,
"step": 12860
},
{
"epoch": 1.04,
"grad_norm": 7.849809107312115,
"learning_rate": 1.5564882622875715e-05,
"loss": 0.1772,
"step": 12880
},
{
"epoch": 1.04,
"grad_norm": 6.076234028129562,
"learning_rate": 1.5550088141867542e-05,
"loss": 0.1798,
"step": 12900
},
{
"epoch": 1.04,
"grad_norm": 8.417089571258343,
"learning_rate": 1.553527608531386e-05,
"loss": 0.2224,
"step": 12920
},
{
"epoch": 1.04,
"grad_norm": 5.434386315534151,
"learning_rate": 1.5520446500128666e-05,
"loss": 0.1751,
"step": 12940
},
{
"epoch": 1.05,
"grad_norm": 7.365658808789612,
"learning_rate": 1.55055994332815e-05,
"loss": 0.216,
"step": 12960
},
{
"epoch": 1.05,
"grad_norm": 6.124958583146801,
"learning_rate": 1.5490734931797252e-05,
"loss": 0.1785,
"step": 12980
},
{
"epoch": 1.05,
"grad_norm": 6.345434106235919,
"learning_rate": 1.5475853042756045e-05,
"loss": 0.2129,
"step": 13000
},
{
"epoch": 1.05,
"grad_norm": 10.0344020371502,
"learning_rate": 1.5460953813293065e-05,
"loss": 0.178,
"step": 13020
},
{
"epoch": 1.05,
"grad_norm": 7.7502114051780016,
"learning_rate": 1.544603729059842e-05,
"loss": 0.1777,
"step": 13040
},
{
"epoch": 1.05,
"grad_norm": 6.198622753624231,
"learning_rate": 1.5431103521916996e-05,
"loss": 0.2098,
"step": 13060
},
{
"epoch": 1.06,
"grad_norm": 6.48127602670386,
"learning_rate": 1.5416152554548302e-05,
"loss": 0.164,
"step": 13080
},
{
"epoch": 1.06,
"grad_norm": 7.600239308253696,
"learning_rate": 1.5401184435846316e-05,
"loss": 0.1847,
"step": 13100
},
{
"epoch": 1.06,
"grad_norm": 6.423081036482468,
"learning_rate": 1.5386199213219344e-05,
"loss": 0.1873,
"step": 13120
},
{
"epoch": 1.06,
"grad_norm": 7.1989659944439355,
"learning_rate": 1.5371196934129854e-05,
"loss": 0.2092,
"step": 13140
},
{
"epoch": 1.06,
"grad_norm": 4.613997491830078,
"learning_rate": 1.5356177646094348e-05,
"loss": 0.1882,
"step": 13160
},
{
"epoch": 1.06,
"grad_norm": 5.629794641682726,
"learning_rate": 1.5341141396683202e-05,
"loss": 0.1952,
"step": 13180
},
{
"epoch": 1.07,
"grad_norm": 5.86222977330632,
"learning_rate": 1.53260882335205e-05,
"loss": 0.1857,
"step": 13200
},
{
"epoch": 1.07,
"grad_norm": 5.390116349700223,
"learning_rate": 1.5311018204283915e-05,
"loss": 0.1862,
"step": 13220
},
{
"epoch": 1.07,
"grad_norm": 4.734598991710353,
"learning_rate": 1.5295931356704522e-05,
"loss": 0.1922,
"step": 13240
},
{
"epoch": 1.07,
"grad_norm": 6.44238392273467,
"learning_rate": 1.5280827738566673e-05,
"loss": 0.1823,
"step": 13260
},
{
"epoch": 1.07,
"grad_norm": 4.314282919486737,
"learning_rate": 1.5265707397707838e-05,
"loss": 0.1904,
"step": 13280
},
{
"epoch": 1.07,
"grad_norm": 6.471785123561109,
"learning_rate": 1.525057038201845e-05,
"loss": 0.2201,
"step": 13300
},
{
"epoch": 1.08,
"grad_norm": 6.211228619356565,
"learning_rate": 1.523541673944176e-05,
"loss": 0.1941,
"step": 13320
},
{
"epoch": 1.08,
"grad_norm": 5.109706482939786,
"learning_rate": 1.5220246517973674e-05,
"loss": 0.205,
"step": 13340
},
{
"epoch": 1.08,
"grad_norm": 7.1474883569847405,
"learning_rate": 1.5205059765662611e-05,
"loss": 0.1863,
"step": 13360
},
{
"epoch": 1.08,
"grad_norm": 10.992853444090926,
"learning_rate": 1.5189856530609351e-05,
"loss": 0.2029,
"step": 13380
},
{
"epoch": 1.08,
"grad_norm": 5.481913913723081,
"learning_rate": 1.517463686096688e-05,
"loss": 0.2004,
"step": 13400
},
{
"epoch": 1.08,
"grad_norm": 5.850124859903834,
"learning_rate": 1.5159400804940232e-05,
"loss": 0.2029,
"step": 13420
},
{
"epoch": 1.08,
"grad_norm": 5.113867042039441,
"learning_rate": 1.5144148410786344e-05,
"loss": 0.2166,
"step": 13440
},
{
"epoch": 1.09,
"grad_norm": 6.039714348714059,
"learning_rate": 1.51288797268139e-05,
"loss": 0.2116,
"step": 13460
},
{
"epoch": 1.09,
"grad_norm": 7.541603158363756,
"learning_rate": 1.5113594801383178e-05,
"loss": 0.1925,
"step": 13480
},
{
"epoch": 1.09,
"grad_norm": 7.479663488804942,
"learning_rate": 1.50982936829059e-05,
"loss": 0.1953,
"step": 13500
},
{
"epoch": 1.09,
"grad_norm": 6.7557856877486175,
"learning_rate": 1.5082976419845078e-05,
"loss": 0.1976,
"step": 13520
},
{
"epoch": 1.09,
"grad_norm": 5.931350253738143,
"learning_rate": 1.5067643060714844e-05,
"loss": 0.2133,
"step": 13540
},
{
"epoch": 1.09,
"grad_norm": 6.494971457661676,
"learning_rate": 1.5052293654080332e-05,
"loss": 0.176,
"step": 13560
},
{
"epoch": 1.1,
"grad_norm": 5.762783270305054,
"learning_rate": 1.503692824855749e-05,
"loss": 0.2096,
"step": 13580
},
{
"epoch": 1.1,
"grad_norm": 6.848342260542276,
"learning_rate": 1.5021546892812934e-05,
"loss": 0.2034,
"step": 13600
},
{
"epoch": 1.1,
"grad_norm": 5.6448287727059485,
"learning_rate": 1.5006149635563817e-05,
"loss": 0.1936,
"step": 13620
},
{
"epoch": 1.1,
"grad_norm": 5.83680549100651,
"learning_rate": 1.4990736525577642e-05,
"loss": 0.2025,
"step": 13640
},
{
"epoch": 1.1,
"grad_norm": 7.41015135946399,
"learning_rate": 1.4975307611672127e-05,
"loss": 0.2024,
"step": 13660
},
{
"epoch": 1.1,
"grad_norm": 9.778277740797297,
"learning_rate": 1.4959862942715043e-05,
"loss": 0.1707,
"step": 13680
},
{
"epoch": 1.11,
"grad_norm": 5.046412396561728,
"learning_rate": 1.4944402567624065e-05,
"loss": 0.1936,
"step": 13700
},
{
"epoch": 1.11,
"grad_norm": 11.086870666327583,
"learning_rate": 1.492892653536661e-05,
"loss": 0.1979,
"step": 13720
},
{
"epoch": 1.11,
"grad_norm": 6.0254596329111525,
"learning_rate": 1.4913434894959693e-05,
"loss": 0.1791,
"step": 13740
},
{
"epoch": 1.11,
"grad_norm": 7.033701558289489,
"learning_rate": 1.4897927695469756e-05,
"loss": 0.1905,
"step": 13760
},
{
"epoch": 1.11,
"grad_norm": 4.474543730422018,
"learning_rate": 1.4882404986012523e-05,
"loss": 0.1693,
"step": 13780
},
{
"epoch": 1.11,
"grad_norm": 4.1690691824315405,
"learning_rate": 1.4866866815752847e-05,
"loss": 0.1856,
"step": 13800
},
{
"epoch": 1.12,
"grad_norm": 5.6756955564977964,
"learning_rate": 1.4851313233904547e-05,
"loss": 0.2053,
"step": 13820
},
{
"epoch": 1.12,
"grad_norm": 11.164112387266075,
"learning_rate": 1.4835744289730252e-05,
"loss": 0.171,
"step": 13840
},
{
"epoch": 1.12,
"grad_norm": 7.308139120179797,
"learning_rate": 1.4820160032541254e-05,
"loss": 0.1954,
"step": 13860
},
{
"epoch": 1.12,
"grad_norm": 3.5914657630993294,
"learning_rate": 1.4804560511697341e-05,
"loss": 0.2246,
"step": 13880
},
{
"epoch": 1.12,
"grad_norm": 6.751209857032397,
"learning_rate": 1.4788945776606647e-05,
"loss": 0.2013,
"step": 13900
},
{
"epoch": 1.12,
"grad_norm": 6.405950176387068,
"learning_rate": 1.477331587672549e-05,
"loss": 0.2113,
"step": 13920
},
{
"epoch": 1.13,
"grad_norm": 6.376328572976509,
"learning_rate": 1.4757670861558228e-05,
"loss": 0.1924,
"step": 13940
},
{
"epoch": 1.13,
"grad_norm": 9.121068656282398,
"learning_rate": 1.4742010780657085e-05,
"loss": 0.209,
"step": 13960
},
{
"epoch": 1.13,
"grad_norm": 4.7626419486771026,
"learning_rate": 1.4726335683622008e-05,
"loss": 0.2255,
"step": 13980
},
{
"epoch": 1.13,
"grad_norm": 6.817788687267193,
"learning_rate": 1.4710645620100499e-05,
"loss": 0.1896,
"step": 14000
},
{
"epoch": 1.13,
"grad_norm": 7.332059084839946,
"learning_rate": 1.4694940639787466e-05,
"loss": 0.2066,
"step": 14020
},
{
"epoch": 1.13,
"grad_norm": 5.640954681652653,
"learning_rate": 1.4679220792425067e-05,
"loss": 0.1771,
"step": 14040
},
{
"epoch": 1.13,
"grad_norm": 6.095651183293601,
"learning_rate": 1.4663486127802538e-05,
"loss": 0.186,
"step": 14060
},
{
"epoch": 1.14,
"grad_norm": 7.092095404760473,
"learning_rate": 1.464773669575606e-05,
"loss": 0.2142,
"step": 14080
},
{
"epoch": 1.14,
"grad_norm": 4.837369592766958,
"learning_rate": 1.463197254616857e-05,
"loss": 0.2218,
"step": 14100
},
{
"epoch": 1.14,
"grad_norm": 4.9264497356291646,
"learning_rate": 1.4616193728969633e-05,
"loss": 0.1938,
"step": 14120
},
{
"epoch": 1.14,
"grad_norm": 9.674187157366852,
"learning_rate": 1.4600400294135264e-05,
"loss": 0.2098,
"step": 14140
},
{
"epoch": 1.14,
"grad_norm": 6.437523999215732,
"learning_rate": 1.4584592291687777e-05,
"loss": 0.2029,
"step": 14160
},
{
"epoch": 1.14,
"grad_norm": 6.823030504825212,
"learning_rate": 1.4568769771695625e-05,
"loss": 0.1877,
"step": 14180
},
{
"epoch": 1.15,
"grad_norm": 7.082752727573593,
"learning_rate": 1.4552932784273246e-05,
"loss": 0.1928,
"step": 14200
},
{
"epoch": 1.15,
"grad_norm": 11.029950376037581,
"learning_rate": 1.45370813795809e-05,
"loss": 0.1682,
"step": 14220
},
{
"epoch": 1.15,
"grad_norm": 4.956260098113924,
"learning_rate": 1.4521215607824499e-05,
"loss": 0.1972,
"step": 14240
},
{
"epoch": 1.15,
"grad_norm": 5.51023882727362,
"learning_rate": 1.4505335519255482e-05,
"loss": 0.1967,
"step": 14260
},
{
"epoch": 1.15,
"grad_norm": 6.160472680324094,
"learning_rate": 1.4489441164170612e-05,
"loss": 0.1913,
"step": 14280
},
{
"epoch": 1.15,
"grad_norm": 9.72267194404117,
"learning_rate": 1.447353259291185e-05,
"loss": 0.1818,
"step": 14300
},
{
"epoch": 1.16,
"grad_norm": 8.889058328709032,
"learning_rate": 1.4457609855866181e-05,
"loss": 0.2082,
"step": 14320
},
{
"epoch": 1.16,
"grad_norm": 3.973002280859937,
"learning_rate": 1.4441673003465458e-05,
"loss": 0.1851,
"step": 14340
},
{
"epoch": 1.16,
"grad_norm": 6.797652557445699,
"learning_rate": 1.4425722086186236e-05,
"loss": 0.191,
"step": 14360
},
{
"epoch": 1.16,
"grad_norm": 7.229624826239175,
"learning_rate": 1.4409757154549621e-05,
"loss": 0.1891,
"step": 14380
},
{
"epoch": 1.16,
"grad_norm": 6.741986922621174,
"learning_rate": 1.4393778259121113e-05,
"loss": 0.1868,
"step": 14400
},
{
"epoch": 1.16,
"grad_norm": 4.781423328723562,
"learning_rate": 1.4377785450510426e-05,
"loss": 0.1953,
"step": 14420
},
{
"epoch": 1.17,
"grad_norm": 5.074661529228375,
"learning_rate": 1.436177877937135e-05,
"loss": 0.2004,
"step": 14440
},
{
"epoch": 1.17,
"grad_norm": 8.033180093952518,
"learning_rate": 1.4345758296401585e-05,
"loss": 0.1816,
"step": 14460
},
{
"epoch": 1.17,
"grad_norm": 5.974169762042515,
"learning_rate": 1.4329724052342569e-05,
"loss": 0.192,
"step": 14480
},
{
"epoch": 1.17,
"grad_norm": 4.656455835490763,
"learning_rate": 1.4313676097979326e-05,
"loss": 0.1835,
"step": 14500
},
{
"epoch": 1.17,
"grad_norm": 3.1936684555124617,
"learning_rate": 1.4297614484140307e-05,
"loss": 0.1808,
"step": 14520
},
{
"epoch": 1.17,
"grad_norm": 8.20373974631054,
"learning_rate": 1.4281539261697228e-05,
"loss": 0.1836,
"step": 14540
},
{
"epoch": 1.18,
"grad_norm": 6.186778941670351,
"learning_rate": 1.4265450481564904e-05,
"loss": 0.1946,
"step": 14560
},
{
"epoch": 1.18,
"grad_norm": 6.498458092545216,
"learning_rate": 1.4249348194701091e-05,
"loss": 0.1883,
"step": 14580
},
{
"epoch": 1.18,
"grad_norm": 4.671798654398146,
"learning_rate": 1.4233232452106331e-05,
"loss": 0.1981,
"step": 14600
},
{
"epoch": 1.18,
"grad_norm": 6.939397878773192,
"learning_rate": 1.4217103304823774e-05,
"loss": 0.1858,
"step": 14620
},
{
"epoch": 1.18,
"grad_norm": 9.00976329632259,
"learning_rate": 1.4200960803939034e-05,
"loss": 0.1917,
"step": 14640
},
{
"epoch": 1.18,
"grad_norm": 6.559576346045498,
"learning_rate": 1.4184805000580018e-05,
"loss": 0.1915,
"step": 14660
},
{
"epoch": 1.18,
"grad_norm": 5.306030822468831,
"learning_rate": 1.4168635945916762e-05,
"loss": 0.2023,
"step": 14680
},
{
"epoch": 1.19,
"grad_norm": 6.917396077987428,
"learning_rate": 1.4152453691161279e-05,
"loss": 0.201,
"step": 14700
},
{
"epoch": 1.19,
"grad_norm": 5.388833984060343,
"learning_rate": 1.4136258287567386e-05,
"loss": 0.1951,
"step": 14720
},
{
"epoch": 1.19,
"grad_norm": 5.741268992759075,
"learning_rate": 1.412004978643055e-05,
"loss": 0.2052,
"step": 14740
},
{
"epoch": 1.19,
"grad_norm": 8.528637831977798,
"learning_rate": 1.4103828239087713e-05,
"loss": 0.1911,
"step": 14760
},
{
"epoch": 1.19,
"grad_norm": 7.229831133673562,
"learning_rate": 1.4087593696917152e-05,
"loss": 0.2147,
"step": 14780
},
{
"epoch": 1.19,
"grad_norm": 8.453977418463722,
"learning_rate": 1.4071346211338287e-05,
"loss": 0.2056,
"step": 14800
},
{
"epoch": 1.2,
"grad_norm": 4.0493875016839205,
"learning_rate": 1.4055085833811543e-05,
"loss": 0.1875,
"step": 14820
},
{
"epoch": 1.2,
"grad_norm": 6.315957510536494,
"learning_rate": 1.403881261583818e-05,
"loss": 0.2049,
"step": 14840
},
{
"epoch": 1.2,
"grad_norm": 6.174807081413619,
"learning_rate": 1.4022526608960117e-05,
"loss": 0.1887,
"step": 14860
},
{
"epoch": 1.2,
"grad_norm": 5.128483960445226,
"learning_rate": 1.4006227864759787e-05,
"loss": 0.1958,
"step": 14880
},
{
"epoch": 1.2,
"grad_norm": 6.592332443520915,
"learning_rate": 1.3989916434859961e-05,
"loss": 0.174,
"step": 14900
},
{
"epoch": 1.2,
"grad_norm": 5.47341406161348,
"learning_rate": 1.3973592370923594e-05,
"loss": 0.1972,
"step": 14920
},
{
"epoch": 1.21,
"grad_norm": 10.100193860299168,
"learning_rate": 1.395725572465366e-05,
"loss": 0.217,
"step": 14940
},
{
"epoch": 1.21,
"grad_norm": 5.393687564139285,
"learning_rate": 1.394090654779297e-05,
"loss": 0.1746,
"step": 14960
},
{
"epoch": 1.21,
"grad_norm": 4.862848248796205,
"learning_rate": 1.3924544892124037e-05,
"loss": 0.1804,
"step": 14980
},
{
"epoch": 1.21,
"grad_norm": 5.347965295637652,
"learning_rate": 1.390817080946889e-05,
"loss": 0.1774,
"step": 15000
},
{
"epoch": 1.21,
"grad_norm": 6.040766822188126,
"learning_rate": 1.3891784351688921e-05,
"loss": 0.2123,
"step": 15020
},
{
"epoch": 1.21,
"grad_norm": 5.8662001265351105,
"learning_rate": 1.3875385570684725e-05,
"loss": 0.1888,
"step": 15040
},
{
"epoch": 1.22,
"grad_norm": 3.4039196477297584,
"learning_rate": 1.3858974518395912e-05,
"loss": 0.1776,
"step": 15060
},
{
"epoch": 1.22,
"grad_norm": 5.345588055655486,
"learning_rate": 1.384255124680097e-05,
"loss": 0.1934,
"step": 15080
},
{
"epoch": 1.22,
"grad_norm": 7.7672522699283455,
"learning_rate": 1.3826115807917088e-05,
"loss": 0.1881,
"step": 15100
},
{
"epoch": 1.22,
"grad_norm": 14.607170518002183,
"learning_rate": 1.3809668253799989e-05,
"loss": 0.1992,
"step": 15120
},
{
"epoch": 1.22,
"grad_norm": 7.122439543446391,
"learning_rate": 1.379320863654377e-05,
"loss": 0.2071,
"step": 15140
},
{
"epoch": 1.22,
"grad_norm": 7.078892142047386,
"learning_rate": 1.3776737008280734e-05,
"loss": 0.1846,
"step": 15160
},
{
"epoch": 1.23,
"grad_norm": 4.833315926005269,
"learning_rate": 1.3760253421181232e-05,
"loss": 0.1955,
"step": 15180
},
{
"epoch": 1.23,
"grad_norm": 7.642880346774846,
"learning_rate": 1.3743757927453485e-05,
"loss": 0.1926,
"step": 15200
},
{
"epoch": 1.23,
"grad_norm": 5.4079945125552,
"learning_rate": 1.3727250579343427e-05,
"loss": 0.1873,
"step": 15220
},
{
"epoch": 1.23,
"grad_norm": 5.948430330328783,
"learning_rate": 1.371073142913454e-05,
"loss": 0.207,
"step": 15240
},
{
"epoch": 1.23,
"grad_norm": 5.5464350136347615,
"learning_rate": 1.369420052914769e-05,
"loss": 0.2059,
"step": 15260
},
{
"epoch": 1.23,
"grad_norm": 6.914142224558012,
"learning_rate": 1.3677657931740953e-05,
"loss": 0.2101,
"step": 15280
},
{
"epoch": 1.24,
"grad_norm": 9.57042012056836,
"learning_rate": 1.3661103689309451e-05,
"loss": 0.1845,
"step": 15300
},
{
"epoch": 1.24,
"grad_norm": 4.631446263817481,
"learning_rate": 1.3644537854285198e-05,
"loss": 0.1676,
"step": 15320
},
{
"epoch": 1.24,
"grad_norm": 7.563813907200117,
"learning_rate": 1.3627960479136917e-05,
"loss": 0.1959,
"step": 15340
},
{
"epoch": 1.24,
"grad_norm": 7.423882592374514,
"learning_rate": 1.3611371616369888e-05,
"loss": 0.2119,
"step": 15360
},
{
"epoch": 1.24,
"grad_norm": 8.033278865814227,
"learning_rate": 1.3594771318525772e-05,
"loss": 0.1999,
"step": 15380
},
{
"epoch": 1.24,
"grad_norm": 4.263461392179193,
"learning_rate": 1.3578159638182443e-05,
"loss": 0.1623,
"step": 15400
},
{
"epoch": 1.24,
"grad_norm": 5.605534420891448,
"learning_rate": 1.3561536627953846e-05,
"loss": 0.1878,
"step": 15420
},
{
"epoch": 1.25,
"grad_norm": 7.271725167246008,
"learning_rate": 1.3544902340489788e-05,
"loss": 0.203,
"step": 15440
},
{
"epoch": 1.25,
"grad_norm": 24.17397448663477,
"learning_rate": 1.3528256828475806e-05,
"loss": 0.1996,
"step": 15460
},
{
"epoch": 1.25,
"grad_norm": 8.21922387907255,
"learning_rate": 1.3511600144632984e-05,
"loss": 0.2115,
"step": 15480
},
{
"epoch": 1.25,
"grad_norm": 6.438100546073929,
"learning_rate": 1.3494932341717795e-05,
"loss": 0.2178,
"step": 15500
},
{
"epoch": 1.25,
"grad_norm": 5.438714078216004,
"learning_rate": 1.3478253472521926e-05,
"loss": 0.2035,
"step": 15520
},
{
"epoch": 1.25,
"grad_norm": 5.267123317638661,
"learning_rate": 1.3461563589872115e-05,
"loss": 0.1871,
"step": 15540
},
{
"epoch": 1.26,
"grad_norm": 6.952296244061062,
"learning_rate": 1.3444862746629983e-05,
"loss": 0.1796,
"step": 15560
},
{
"epoch": 1.26,
"grad_norm": 5.566630169564181,
"learning_rate": 1.3428150995691864e-05,
"loss": 0.19,
"step": 15580
},
{
"epoch": 1.26,
"grad_norm": 8.317946865138628,
"learning_rate": 1.3411428389988643e-05,
"loss": 0.1867,
"step": 15600
},
{
"epoch": 1.26,
"grad_norm": 5.8312505364290015,
"learning_rate": 1.3394694982485588e-05,
"loss": 0.1966,
"step": 15620
},
{
"epoch": 1.26,
"grad_norm": 5.558038014921936,
"learning_rate": 1.3377950826182167e-05,
"loss": 0.2084,
"step": 15640
},
{
"epoch": 1.26,
"grad_norm": 3.3670931844933434,
"learning_rate": 1.3361195974111908e-05,
"loss": 0.1886,
"step": 15660
},
{
"epoch": 1.27,
"grad_norm": 6.730872285494118,
"learning_rate": 1.3344430479342205e-05,
"loss": 0.1991,
"step": 15680
},
{
"epoch": 1.27,
"grad_norm": 5.221936012884134,
"learning_rate": 1.3327654394974164e-05,
"loss": 0.1871,
"step": 15700
},
{
"epoch": 1.27,
"grad_norm": 8.096789875414892,
"learning_rate": 1.3310867774142433e-05,
"loss": 0.1799,
"step": 15720
},
{
"epoch": 1.27,
"grad_norm": 4.9068121132509335,
"learning_rate": 1.3294070670015026e-05,
"loss": 0.1817,
"step": 15740
},
{
"epoch": 1.27,
"grad_norm": 3.9856004214295386,
"learning_rate": 1.3277263135793167e-05,
"loss": 0.1793,
"step": 15760
},
{
"epoch": 1.27,
"grad_norm": 7.118631561874846,
"learning_rate": 1.3260445224711115e-05,
"loss": 0.1787,
"step": 15780
},
{
"epoch": 1.28,
"grad_norm": 7.914428581966782,
"learning_rate": 1.3243616990035988e-05,
"loss": 0.1821,
"step": 15800
},
{
"epoch": 1.28,
"grad_norm": 6.602254147384827,
"learning_rate": 1.322677848506761e-05,
"loss": 0.1746,
"step": 15820
},
{
"epoch": 1.28,
"grad_norm": 6.142691150589716,
"learning_rate": 1.3209929763138333e-05,
"loss": 0.1964,
"step": 15840
},
{
"epoch": 1.28,
"grad_norm": 7.318010910855917,
"learning_rate": 1.3193070877612863e-05,
"loss": 0.1974,
"step": 15860
},
{
"epoch": 1.28,
"grad_norm": 6.924430013948181,
"learning_rate": 1.3176201881888104e-05,
"loss": 0.1991,
"step": 15880
},
{
"epoch": 1.28,
"grad_norm": 7.883879242504948,
"learning_rate": 1.3159322829392978e-05,
"loss": 0.1924,
"step": 15900
},
{
"epoch": 1.29,
"grad_norm": 5.220112148848683,
"learning_rate": 1.3142433773588259e-05,
"loss": 0.2138,
"step": 15920
},
{
"epoch": 1.29,
"grad_norm": 5.620528075363317,
"learning_rate": 1.3125534767966406e-05,
"loss": 0.1833,
"step": 15940
},
{
"epoch": 1.29,
"grad_norm": 7.394718512387162,
"learning_rate": 1.3108625866051393e-05,
"loss": 0.1745,
"step": 15960
},
{
"epoch": 1.29,
"grad_norm": 4.8000424144778275,
"learning_rate": 1.3091707121398535e-05,
"loss": 0.2024,
"step": 15980
},
{
"epoch": 1.29,
"grad_norm": 4.774826621857471,
"learning_rate": 1.3074778587594328e-05,
"loss": 0.2015,
"step": 16000
},
{
"epoch": 1.29,
"grad_norm": 9.387391712768,
"learning_rate": 1.3057840318256265e-05,
"loss": 0.1795,
"step": 16020
},
{
"epoch": 1.29,
"grad_norm": 4.963107377940831,
"learning_rate": 1.3040892367032682e-05,
"loss": 0.1653,
"step": 16040
},
{
"epoch": 1.3,
"grad_norm": 6.997759324175277,
"learning_rate": 1.3023934787602572e-05,
"loss": 0.2063,
"step": 16060
},
{
"epoch": 1.3,
"grad_norm": 6.190583972900869,
"learning_rate": 1.3006967633675432e-05,
"loss": 0.2153,
"step": 16080
},
{
"epoch": 1.3,
"grad_norm": 6.824679624981862,
"learning_rate": 1.2989990958991077e-05,
"loss": 0.1891,
"step": 16100
},
{
"epoch": 1.3,
"grad_norm": 8.754013884741005,
"learning_rate": 1.2973004817319479e-05,
"loss": 0.1804,
"step": 16120
},
{
"epoch": 1.3,
"grad_norm": 8.960491352232728,
"learning_rate": 1.29560092624606e-05,
"loss": 0.1923,
"step": 16140
},
{
"epoch": 1.3,
"grad_norm": 4.719281501394497,
"learning_rate": 1.2939004348244207e-05,
"loss": 0.2186,
"step": 16160
},
{
"epoch": 1.31,
"grad_norm": 6.0936889645380825,
"learning_rate": 1.2921990128529713e-05,
"loss": 0.2008,
"step": 16180
},
{
"epoch": 1.31,
"grad_norm": 6.559249218102312,
"learning_rate": 1.2904966657206013e-05,
"loss": 0.1968,
"step": 16200
},
{
"epoch": 1.31,
"grad_norm": 7.485638410293939,
"learning_rate": 1.2887933988191297e-05,
"loss": 0.1754,
"step": 16220
},
{
"epoch": 1.31,
"grad_norm": 4.949399299007962,
"learning_rate": 1.2870892175432887e-05,
"loss": 0.1949,
"step": 16240
},
{
"epoch": 1.31,
"grad_norm": 5.031890259751463,
"learning_rate": 1.2853841272907068e-05,
"loss": 0.1697,
"step": 16260
},
{
"epoch": 1.31,
"grad_norm": 3.2773740128221367,
"learning_rate": 1.2836781334618912e-05,
"loss": 0.1706,
"step": 16280
},
{
"epoch": 1.32,
"grad_norm": 7.100024972791139,
"learning_rate": 1.2819712414602112e-05,
"loss": 0.1725,
"step": 16300
},
{
"epoch": 1.32,
"grad_norm": 6.847007498771447,
"learning_rate": 1.2802634566918806e-05,
"loss": 0.2224,
"step": 16320
},
{
"epoch": 1.32,
"grad_norm": 18.124907394698774,
"learning_rate": 1.2785547845659412e-05,
"loss": 0.1954,
"step": 16340
},
{
"epoch": 1.32,
"grad_norm": 6.653915067377773,
"learning_rate": 1.2768452304942449e-05,
"loss": 0.195,
"step": 16360
},
{
"epoch": 1.32,
"grad_norm": 13.653829050105037,
"learning_rate": 1.275134799891438e-05,
"loss": 0.1771,
"step": 16380
},
{
"epoch": 1.32,
"grad_norm": 5.199783489414919,
"learning_rate": 1.2734234981749416e-05,
"loss": 0.1697,
"step": 16400
},
{
"epoch": 1.33,
"grad_norm": 4.938204944360832,
"learning_rate": 1.2717113307649367e-05,
"loss": 0.2153,
"step": 16420
},
{
"epoch": 1.33,
"grad_norm": 5.379988728337729,
"learning_rate": 1.2699983030843462e-05,
"loss": 0.1807,
"step": 16440
},
{
"epoch": 1.33,
"grad_norm": 5.563035060565681,
"learning_rate": 1.2682844205588175e-05,
"loss": 0.1723,
"step": 16460
},
{
"epoch": 1.33,
"grad_norm": 4.137830028452154,
"learning_rate": 1.2665696886167054e-05,
"loss": 0.2015,
"step": 16480
},
{
"epoch": 1.33,
"grad_norm": 5.444985685023187,
"learning_rate": 1.2648541126890553e-05,
"loss": 0.1891,
"step": 16500
},
{
"epoch": 1.33,
"grad_norm": 6.758562379366711,
"learning_rate": 1.2631376982095857e-05,
"loss": 0.1794,
"step": 16520
},
{
"epoch": 1.34,
"grad_norm": 6.42547461986865,
"learning_rate": 1.2614204506146714e-05,
"loss": 0.2072,
"step": 16540
},
{
"epoch": 1.34,
"grad_norm": 6.67478604205973,
"learning_rate": 1.2597023753433248e-05,
"loss": 0.1752,
"step": 16560
},
{
"epoch": 1.34,
"grad_norm": 5.114865214342277,
"learning_rate": 1.2579834778371814e-05,
"loss": 0.2129,
"step": 16580
},
{
"epoch": 1.34,
"grad_norm": 6.721543126268775,
"learning_rate": 1.2562637635404791e-05,
"loss": 0.1774,
"step": 16600
},
{
"epoch": 1.34,
"grad_norm": 5.93001108203901,
"learning_rate": 1.2545432379000448e-05,
"loss": 0.1773,
"step": 16620
},
{
"epoch": 1.34,
"grad_norm": 5.7523308051939095,
"learning_rate": 1.2528219063652729e-05,
"loss": 0.2078,
"step": 16640
},
{
"epoch": 1.34,
"grad_norm": 6.871500685256494,
"learning_rate": 1.2510997743881129e-05,
"loss": 0.1804,
"step": 16660
},
{
"epoch": 1.35,
"grad_norm": 6.975118558142685,
"learning_rate": 1.249376847423047e-05,
"loss": 0.1923,
"step": 16680
},
{
"epoch": 1.35,
"grad_norm": 5.518168870477368,
"learning_rate": 1.2476531309270773e-05,
"loss": 0.2043,
"step": 16700
},
{
"epoch": 1.35,
"grad_norm": 7.024733851268127,
"learning_rate": 1.2459286303597055e-05,
"loss": 0.1957,
"step": 16720
},
{
"epoch": 1.35,
"grad_norm": 5.189450368424653,
"learning_rate": 1.244203351182917e-05,
"loss": 0.1972,
"step": 16740
},
{
"epoch": 1.35,
"grad_norm": 4.769858359227226,
"learning_rate": 1.2424772988611631e-05,
"loss": 0.2045,
"step": 16760
},
{
"epoch": 1.35,
"grad_norm": 5.6345946816574575,
"learning_rate": 1.2407504788613441e-05,
"loss": 0.184,
"step": 16780
},
{
"epoch": 1.36,
"grad_norm": 8.03446545781803,
"learning_rate": 1.2390228966527917e-05,
"loss": 0.2016,
"step": 16800
},
{
"epoch": 1.36,
"grad_norm": 5.376750572260963,
"learning_rate": 1.2372945577072516e-05,
"loss": 0.221,
"step": 16820
},
{
"epoch": 1.36,
"grad_norm": 3.9735936922072836,
"learning_rate": 1.2355654674988669e-05,
"loss": 0.2193,
"step": 16840
},
{
"epoch": 1.36,
"grad_norm": 4.966981585117317,
"learning_rate": 1.2338356315041587e-05,
"loss": 0.1788,
"step": 16860
},
{
"epoch": 1.36,
"grad_norm": 5.389331607177196,
"learning_rate": 1.232105055202012e-05,
"loss": 0.2325,
"step": 16880
},
{
"epoch": 1.36,
"grad_norm": 3.4531020772181753,
"learning_rate": 1.2303737440736553e-05,
"loss": 0.1978,
"step": 16900
},
{
"epoch": 1.37,
"grad_norm": 8.524005426914416,
"learning_rate": 1.2286417036026454e-05,
"loss": 0.2219,
"step": 16920
},
{
"epoch": 1.37,
"grad_norm": 8.61532713797543,
"learning_rate": 1.2269089392748484e-05,
"loss": 0.1786,
"step": 16940
},
{
"epoch": 1.37,
"grad_norm": 7.510098080661287,
"learning_rate": 1.225175456578423e-05,
"loss": 0.192,
"step": 16960
},
{
"epoch": 1.37,
"grad_norm": 5.7571280705405075,
"learning_rate": 1.2234412610038045e-05,
"loss": 0.1884,
"step": 16980
},
{
"epoch": 1.37,
"grad_norm": 6.431467433327115,
"learning_rate": 1.2217063580436841e-05,
"loss": 0.1861,
"step": 17000
},
{
"epoch": 1.37,
"grad_norm": 5.997538061494255,
"learning_rate": 1.219970753192995e-05,
"loss": 0.196,
"step": 17020
},
{
"epoch": 1.38,
"grad_norm": 5.041053272270793,
"learning_rate": 1.218234451948893e-05,
"loss": 0.1676,
"step": 17040
},
{
"epoch": 1.38,
"grad_norm": 8.26975958810288,
"learning_rate": 1.2164974598107398e-05,
"loss": 0.1953,
"step": 17060
},
{
"epoch": 1.38,
"grad_norm": 4.861930347014316,
"learning_rate": 1.2147597822800843e-05,
"loss": 0.2077,
"step": 17080
},
{
"epoch": 1.38,
"grad_norm": 6.4205005966339534,
"learning_rate": 1.2130214248606478e-05,
"loss": 0.1743,
"step": 17100
},
{
"epoch": 1.38,
"grad_norm": 6.274607928060438,
"learning_rate": 1.2112823930583042e-05,
"loss": 0.168,
"step": 17120
},
{
"epoch": 1.38,
"grad_norm": 8.376715185548848,
"learning_rate": 1.2095426923810631e-05,
"loss": 0.1821,
"step": 17140
},
{
"epoch": 1.39,
"grad_norm": 8.347057130026942,
"learning_rate": 1.2078023283390532e-05,
"loss": 0.201,
"step": 17160
},
{
"epoch": 1.39,
"grad_norm": 6.10042452976507,
"learning_rate": 1.2060613064445041e-05,
"loss": 0.1864,
"step": 17180
},
{
"epoch": 1.39,
"grad_norm": 6.882586091178972,
"learning_rate": 1.204319632211729e-05,
"loss": 0.1945,
"step": 17200
},
{
"epoch": 1.39,
"grad_norm": 4.831518270883375,
"learning_rate": 1.2025773111571067e-05,
"loss": 0.1997,
"step": 17220
},
{
"epoch": 1.39,
"grad_norm": 5.820557290888362,
"learning_rate": 1.2008343487990652e-05,
"loss": 0.213,
"step": 17240
},
{
"epoch": 1.39,
"grad_norm": 7.590766477620916,
"learning_rate": 1.199090750658064e-05,
"loss": 0.1943,
"step": 17260
},
{
"epoch": 1.39,
"grad_norm": 5.245303096541225,
"learning_rate": 1.1973465222565756e-05,
"loss": 0.1935,
"step": 17280
},
{
"epoch": 1.4,
"grad_norm": 5.635466946173672,
"learning_rate": 1.1956016691190693e-05,
"loss": 0.1937,
"step": 17300
},
{
"epoch": 1.4,
"grad_norm": 9.8612439882403,
"learning_rate": 1.1938561967719929e-05,
"loss": 0.1998,
"step": 17320
},
{
"epoch": 1.4,
"grad_norm": 7.57010186594224,
"learning_rate": 1.1921101107437547e-05,
"loss": 0.1859,
"step": 17340
},
{
"epoch": 1.4,
"grad_norm": 5.578111322637294,
"learning_rate": 1.190363416564708e-05,
"loss": 0.1885,
"step": 17360
},
{
"epoch": 1.4,
"grad_norm": 10.636687907364392,
"learning_rate": 1.188616119767132e-05,
"loss": 0.2183,
"step": 17380
},
{
"epoch": 1.4,
"grad_norm": 5.149639168848235,
"learning_rate": 1.1868682258852135e-05,
"loss": 0.1854,
"step": 17400
},
{
"epoch": 1.41,
"grad_norm": 7.37696372575026,
"learning_rate": 1.1851197404550314e-05,
"loss": 0.1859,
"step": 17420
},
{
"epoch": 1.41,
"grad_norm": 7.336027199681234,
"learning_rate": 1.183370669014538e-05,
"loss": 0.1685,
"step": 17440
},
{
"epoch": 1.41,
"grad_norm": 5.449592911101573,
"learning_rate": 1.181621017103542e-05,
"loss": 0.2028,
"step": 17460
},
{
"epoch": 1.41,
"grad_norm": 8.354625001227333,
"learning_rate": 1.1798707902636895e-05,
"loss": 0.1841,
"step": 17480
},
{
"epoch": 1.41,
"grad_norm": 6.278768324372431,
"learning_rate": 1.178119994038449e-05,
"loss": 0.1682,
"step": 17500
},
{
"epoch": 1.41,
"grad_norm": 6.121827218539604,
"learning_rate": 1.1763686339730911e-05,
"loss": 0.1864,
"step": 17520
},
{
"epoch": 1.42,
"grad_norm": 4.642245950398283,
"learning_rate": 1.174616715614673e-05,
"loss": 0.1919,
"step": 17540
},
{
"epoch": 1.42,
"grad_norm": 12.026938401540649,
"learning_rate": 1.1728642445120205e-05,
"loss": 0.1876,
"step": 17560
},
{
"epoch": 1.42,
"grad_norm": 5.259501261110696,
"learning_rate": 1.1711112262157093e-05,
"loss": 0.196,
"step": 17580
},
{
"epoch": 1.42,
"grad_norm": 6.612093751827706,
"learning_rate": 1.1693576662780486e-05,
"loss": 0.1811,
"step": 17600
},
{
"epoch": 1.42,
"grad_norm": 6.169870867251168,
"learning_rate": 1.167603570253063e-05,
"loss": 0.1955,
"step": 17620
},
{
"epoch": 1.42,
"grad_norm": 7.205754195375382,
"learning_rate": 1.1658489436964753e-05,
"loss": 0.1806,
"step": 17640
},
{
"epoch": 1.43,
"grad_norm": 7.67399813297936,
"learning_rate": 1.1640937921656882e-05,
"loss": 0.198,
"step": 17660
},
{
"epoch": 1.43,
"grad_norm": 5.102867345151343,
"learning_rate": 1.1623381212197677e-05,
"loss": 0.1663,
"step": 17680
},
{
"epoch": 1.43,
"grad_norm": 6.182588419638686,
"learning_rate": 1.1605819364194244e-05,
"loss": 0.1972,
"step": 17700
},
{
"epoch": 1.43,
"grad_norm": 8.729593950859055,
"learning_rate": 1.1588252433269966e-05,
"loss": 0.1978,
"step": 17720
},
{
"epoch": 1.43,
"grad_norm": 4.896187735642024,
"learning_rate": 1.1570680475064328e-05,
"loss": 0.181,
"step": 17740
},
{
"epoch": 1.43,
"grad_norm": 8.586607472154492,
"learning_rate": 1.1553103545232738e-05,
"loss": 0.1778,
"step": 17760
},
{
"epoch": 1.44,
"grad_norm": 7.077719973666291,
"learning_rate": 1.1535521699446344e-05,
"loss": 0.1881,
"step": 17780
},
{
"epoch": 1.44,
"grad_norm": 6.368844445064559,
"learning_rate": 1.151793499339187e-05,
"loss": 0.1837,
"step": 17800
},
{
"epoch": 1.44,
"grad_norm": 5.603581999541538,
"learning_rate": 1.1500343482771433e-05,
"loss": 0.1788,
"step": 17820
},
{
"epoch": 1.44,
"grad_norm": 6.124117106428645,
"learning_rate": 1.1482747223302362e-05,
"loss": 0.2073,
"step": 17840
},
{
"epoch": 1.44,
"grad_norm": 4.896094240852357,
"learning_rate": 1.146514627071704e-05,
"loss": 0.2018,
"step": 17860
},
{
"epoch": 1.44,
"grad_norm": 8.633063928041105,
"learning_rate": 1.1447540680762697e-05,
"loss": 0.187,
"step": 17880
},
{
"epoch": 1.44,
"grad_norm": 5.8650395795782595,
"learning_rate": 1.1429930509201264e-05,
"loss": 0.1884,
"step": 17900
},
{
"epoch": 1.45,
"grad_norm": 3.3374318833847245,
"learning_rate": 1.141231581180918e-05,
"loss": 0.1755,
"step": 17920
},
{
"epoch": 1.45,
"grad_norm": 9.55305523159018,
"learning_rate": 1.1394696644377216e-05,
"loss": 0.183,
"step": 17940
},
{
"epoch": 1.45,
"grad_norm": 5.489288302847405,
"learning_rate": 1.1377073062710309e-05,
"loss": 0.1963,
"step": 17960
},
{
"epoch": 1.45,
"grad_norm": 5.8983191471983805,
"learning_rate": 1.1359445122627362e-05,
"loss": 0.1895,
"step": 17980
},
{
"epoch": 1.45,
"grad_norm": 7.189465859300051,
"learning_rate": 1.1341812879961095e-05,
"loss": 0.1673,
"step": 18000
},
{
"epoch": 1.45,
"grad_norm": 7.447490489655281,
"learning_rate": 1.1324176390557853e-05,
"loss": 0.1809,
"step": 18020
},
{
"epoch": 1.46,
"grad_norm": 5.775175955597824,
"learning_rate": 1.1306535710277428e-05,
"loss": 0.1791,
"step": 18040
},
{
"epoch": 1.46,
"grad_norm": 8.2814761724716,
"learning_rate": 1.1288890894992888e-05,
"loss": 0.1819,
"step": 18060
},
{
"epoch": 1.46,
"grad_norm": 4.995617280837431,
"learning_rate": 1.12712420005904e-05,
"loss": 0.1925,
"step": 18080
},
{
"epoch": 1.46,
"grad_norm": 5.884038263005907,
"learning_rate": 1.1253589082969046e-05,
"loss": 0.1854,
"step": 18100
},
{
"epoch": 1.46,
"grad_norm": 4.022777792168135,
"learning_rate": 1.1235932198040653e-05,
"loss": 0.1728,
"step": 18120
},
{
"epoch": 1.46,
"grad_norm": 4.792975515098119,
"learning_rate": 1.1218271401729617e-05,
"loss": 0.1836,
"step": 18140
},
{
"epoch": 1.47,
"grad_norm": 5.250400164657746,
"learning_rate": 1.1200606749972718e-05,
"loss": 0.167,
"step": 18160
},
{
"epoch": 1.47,
"grad_norm": 7.457426673152692,
"learning_rate": 1.1182938298718945e-05,
"loss": 0.1829,
"step": 18180
},
{
"epoch": 1.47,
"grad_norm": 14.206800331502505,
"learning_rate": 1.1165266103929328e-05,
"loss": 0.1778,
"step": 18200
},
{
"epoch": 1.47,
"grad_norm": 6.502359695534185,
"learning_rate": 1.1147590221576754e-05,
"loss": 0.1799,
"step": 18220
},
{
"epoch": 1.47,
"grad_norm": 7.1040996052400525,
"learning_rate": 1.1129910707645779e-05,
"loss": 0.1917,
"step": 18240
},
{
"epoch": 1.47,
"grad_norm": 6.809597692047531,
"learning_rate": 1.1112227618132472e-05,
"loss": 0.1584,
"step": 18260
},
{
"epoch": 1.48,
"grad_norm": 6.214106220478112,
"learning_rate": 1.1094541009044219e-05,
"loss": 0.1745,
"step": 18280
},
{
"epoch": 1.48,
"grad_norm": 6.0818007467761,
"learning_rate": 1.1076850936399564e-05,
"loss": 0.1811,
"step": 18300
},
{
"epoch": 1.48,
"grad_norm": 7.541164852188345,
"learning_rate": 1.1059157456228008e-05,
"loss": 0.1642,
"step": 18320
},
{
"epoch": 1.48,
"grad_norm": 8.065093340490257,
"learning_rate": 1.104146062456986e-05,
"loss": 0.1801,
"step": 18340
},
{
"epoch": 1.48,
"grad_norm": 6.675920675840688,
"learning_rate": 1.1023760497476028e-05,
"loss": 0.1756,
"step": 18360
},
{
"epoch": 1.48,
"grad_norm": 6.6987902445864655,
"learning_rate": 1.1006057131007866e-05,
"loss": 0.1795,
"step": 18380
},
{
"epoch": 1.49,
"grad_norm": 6.510299738533309,
"learning_rate": 1.0988350581236991e-05,
"loss": 0.1865,
"step": 18400
},
{
"epoch": 1.49,
"grad_norm": 6.34510225061825,
"learning_rate": 1.0970640904245094e-05,
"loss": 0.1955,
"step": 18420
},
{
"epoch": 1.49,
"grad_norm": 4.228947726190241,
"learning_rate": 1.0952928156123781e-05,
"loss": 0.175,
"step": 18440
},
{
"epoch": 1.49,
"grad_norm": 6.238306015624761,
"learning_rate": 1.0935212392974372e-05,
"loss": 0.1767,
"step": 18460
},
{
"epoch": 1.49,
"grad_norm": 5.8825690027835424,
"learning_rate": 1.0917493670907751e-05,
"loss": 0.1804,
"step": 18480
},
{
"epoch": 1.49,
"grad_norm": 5.307188542805111,
"learning_rate": 1.0899772046044157e-05,
"loss": 0.2165,
"step": 18500
},
{
"epoch": 1.49,
"grad_norm": 7.671752182857302,
"learning_rate": 1.0882047574513045e-05,
"loss": 0.1754,
"step": 18520
},
{
"epoch": 1.5,
"grad_norm": 7.228224840476562,
"learning_rate": 1.0864320312452865e-05,
"loss": 0.1749,
"step": 18540
},
{
"epoch": 1.5,
"grad_norm": 7.022023025670514,
"learning_rate": 1.0846590316010918e-05,
"loss": 0.1815,
"step": 18560
},
{
"epoch": 1.5,
"grad_norm": 11.59241800805326,
"learning_rate": 1.082885764134316e-05,
"loss": 0.1947,
"step": 18580
},
{
"epoch": 1.5,
"grad_norm": 5.496283576035116,
"learning_rate": 1.081112234461403e-05,
"loss": 0.191,
"step": 18600
},
{
"epoch": 1.5,
"grad_norm": 8.37203471510935,
"learning_rate": 1.0793384481996279e-05,
"loss": 0.2098,
"step": 18620
},
{
"epoch": 1.5,
"grad_norm": 4.85467675057146,
"learning_rate": 1.0775644109670778e-05,
"loss": 0.199,
"step": 18640
},
{
"epoch": 1.51,
"grad_norm": 7.2927775672082165,
"learning_rate": 1.0757901283826341e-05,
"loss": 0.1763,
"step": 18660
},
{
"epoch": 1.51,
"grad_norm": 4.587898523966167,
"learning_rate": 1.0740156060659565e-05,
"loss": 0.1933,
"step": 18680
},
{
"epoch": 1.51,
"grad_norm": 4.846531420101418,
"learning_rate": 1.0722408496374634e-05,
"loss": 0.1605,
"step": 18700
},
{
"epoch": 1.51,
"grad_norm": 6.357471377975472,
"learning_rate": 1.0704658647183155e-05,
"loss": 0.1911,
"step": 18720
},
{
"epoch": 1.51,
"grad_norm": 4.884941528032838,
"learning_rate": 1.0686906569303955e-05,
"loss": 0.1613,
"step": 18740
},
{
"epoch": 1.51,
"grad_norm": 6.578672698862572,
"learning_rate": 1.0669152318962936e-05,
"loss": 0.1583,
"step": 18760
},
{
"epoch": 1.52,
"grad_norm": 4.996568837666117,
"learning_rate": 1.0651395952392876e-05,
"loss": 0.1893,
"step": 18780
},
{
"epoch": 1.52,
"grad_norm": 6.137635879410934,
"learning_rate": 1.0633637525833246e-05,
"loss": 0.2006,
"step": 18800
},
{
"epoch": 1.52,
"grad_norm": 10.339616116567784,
"learning_rate": 1.0615877095530058e-05,
"loss": 0.2034,
"step": 18820
},
{
"epoch": 1.52,
"grad_norm": 5.500147904228012,
"learning_rate": 1.0598114717735661e-05,
"loss": 0.1989,
"step": 18840
},
{
"epoch": 1.52,
"grad_norm": 5.340910932588536,
"learning_rate": 1.0580350448708571e-05,
"loss": 0.2044,
"step": 18860
},
{
"epoch": 1.52,
"grad_norm": 7.0020980415617275,
"learning_rate": 1.0562584344713301e-05,
"loss": 0.1873,
"step": 18880
},
{
"epoch": 1.53,
"grad_norm": 6.566893932198695,
"learning_rate": 1.0544816462020169e-05,
"loss": 0.1672,
"step": 18900
},
{
"epoch": 1.53,
"grad_norm": 21.295579564416062,
"learning_rate": 1.052704685690513e-05,
"loss": 0.1728,
"step": 18920
},
{
"epoch": 1.53,
"grad_norm": 5.869527987425055,
"learning_rate": 1.0509275585649594e-05,
"loss": 0.2102,
"step": 18940
},
{
"epoch": 1.53,
"grad_norm": 15.699095798380197,
"learning_rate": 1.0491502704540249e-05,
"loss": 0.1861,
"step": 18960
},
{
"epoch": 1.53,
"grad_norm": 4.514889983973343,
"learning_rate": 1.0473728269868879e-05,
"loss": 0.189,
"step": 18980
},
{
"epoch": 1.53,
"grad_norm": 6.288009726033927,
"learning_rate": 1.045595233793219e-05,
"loss": 0.1626,
"step": 19000
},
{
"epoch": 1.54,
"grad_norm": 5.491080420793532,
"learning_rate": 1.0438174965031632e-05,
"loss": 0.1763,
"step": 19020
},
{
"epoch": 1.54,
"grad_norm": 5.751789643793079,
"learning_rate": 1.0420396207473214e-05,
"loss": 0.1938,
"step": 19040
},
{
"epoch": 1.54,
"grad_norm": 6.8267007909359,
"learning_rate": 1.0402616121567339e-05,
"loss": 0.1965,
"step": 19060
},
{
"epoch": 1.54,
"grad_norm": 9.0429514544921,
"learning_rate": 1.0384834763628609e-05,
"loss": 0.1956,
"step": 19080
},
{
"epoch": 1.54,
"grad_norm": 4.883758892659512,
"learning_rate": 1.0367052189975661e-05,
"loss": 0.2052,
"step": 19100
},
{
"epoch": 1.54,
"grad_norm": 4.958192369182968,
"learning_rate": 1.0349268456930978e-05,
"loss": 0.1595,
"step": 19120
},
{
"epoch": 1.55,
"grad_norm": 7.1048523990512775,
"learning_rate": 1.0331483620820718e-05,
"loss": 0.1802,
"step": 19140
},
{
"epoch": 1.55,
"grad_norm": 7.81188718299639,
"learning_rate": 1.0313697737974532e-05,
"loss": 0.1762,
"step": 19160
},
{
"epoch": 1.55,
"grad_norm": 7.41902623164456,
"learning_rate": 1.0295910864725385e-05,
"loss": 0.1815,
"step": 19180
},
{
"epoch": 1.55,
"grad_norm": 6.487508949464715,
"learning_rate": 1.027812305740938e-05,
"loss": 0.1868,
"step": 19200
},
{
"epoch": 1.55,
"grad_norm": 6.2665205241809,
"learning_rate": 1.0260334372365579e-05,
"loss": 0.1786,
"step": 19220
},
{
"epoch": 1.55,
"grad_norm": 10.235739114720019,
"learning_rate": 1.0242544865935822e-05,
"loss": 0.1974,
"step": 19240
},
{
"epoch": 1.55,
"grad_norm": 7.9210272040028205,
"learning_rate": 1.0224754594464548e-05,
"loss": 0.1995,
"step": 19260
},
{
"epoch": 1.56,
"grad_norm": 7.599636480772786,
"learning_rate": 1.020696361429863e-05,
"loss": 0.1838,
"step": 19280
},
{
"epoch": 1.56,
"grad_norm": 6.6901787357374305,
"learning_rate": 1.0189171981787176e-05,
"loss": 0.1857,
"step": 19300
},
{
"epoch": 1.56,
"grad_norm": 7.859622048956387,
"learning_rate": 1.0171379753281365e-05,
"loss": 0.1473,
"step": 19320
},
{
"epoch": 1.56,
"grad_norm": 9.023527968972886,
"learning_rate": 1.015358698513426e-05,
"loss": 0.2087,
"step": 19340
},
{
"epoch": 1.56,
"grad_norm": 7.332951564677237,
"learning_rate": 1.0135793733700635e-05,
"loss": 0.1723,
"step": 19360
},
{
"epoch": 1.56,
"grad_norm": 6.6064475500042406,
"learning_rate": 1.0118000055336792e-05,
"loss": 0.1892,
"step": 19380
},
{
"epoch": 1.57,
"grad_norm": 7.156751057104011,
"learning_rate": 1.0100206006400388e-05,
"loss": 0.1808,
"step": 19400
},
{
"epoch": 1.57,
"grad_norm": 7.071734037861301,
"learning_rate": 1.0082411643250256e-05,
"loss": 0.1987,
"step": 19420
},
{
"epoch": 1.57,
"grad_norm": 5.576705794819434,
"learning_rate": 1.0064617022246218e-05,
"loss": 0.1826,
"step": 19440
},
{
"epoch": 1.57,
"grad_norm": 9.630333945880572,
"learning_rate": 1.0046822199748918e-05,
"loss": 0.1778,
"step": 19460
},
{
"epoch": 1.57,
"grad_norm": 10.401189633333978,
"learning_rate": 1.0029027232119637e-05,
"loss": 0.1834,
"step": 19480
},
{
"epoch": 1.57,
"grad_norm": 5.393374772794798,
"learning_rate": 1.0011232175720113e-05,
"loss": 0.1738,
"step": 19500
},
{
"epoch": 1.58,
"grad_norm": 6.924075773515395,
"learning_rate": 9.993437086912373e-06,
"loss": 0.1917,
"step": 19520
},
{
"epoch": 1.58,
"grad_norm": 8.996239194302989,
"learning_rate": 9.975642022058535e-06,
"loss": 0.164,
"step": 19540
},
{
"epoch": 1.58,
"grad_norm": 6.440697625799741,
"learning_rate": 9.95784703752065e-06,
"loss": 0.1846,
"step": 19560
},
{
"epoch": 1.58,
"grad_norm": 4.738285879209919,
"learning_rate": 9.940052189660508e-06,
"loss": 0.2179,
"step": 19580
},
{
"epoch": 1.58,
"grad_norm": 6.078747244317922,
"learning_rate": 9.922257534839473e-06,
"loss": 0.1678,
"step": 19600
},
{
"epoch": 1.58,
"grad_norm": 4.399307517265612,
"learning_rate": 9.904463129418295e-06,
"loss": 0.188,
"step": 19620
},
{
"epoch": 1.59,
"grad_norm": 6.3147663542594135,
"learning_rate": 9.886669029756928e-06,
"loss": 0.1814,
"step": 19640
},
{
"epoch": 1.59,
"grad_norm": 10.295074351756918,
"learning_rate": 9.86887529221437e-06,
"loss": 0.1594,
"step": 19660
},
{
"epoch": 1.59,
"grad_norm": 6.127518428057287,
"learning_rate": 9.851081973148461e-06,
"loss": 0.1583,
"step": 19680
},
{
"epoch": 1.59,
"grad_norm": 7.524745554123201,
"learning_rate": 9.833289128915719e-06,
"loss": 0.1725,
"step": 19700
},
{
"epoch": 1.59,
"grad_norm": 8.809630447586713,
"learning_rate": 9.815496815871163e-06,
"loss": 0.1835,
"step": 19720
},
{
"epoch": 1.59,
"grad_norm": 5.0997599152545705,
"learning_rate": 9.79770509036812e-06,
"loss": 0.1918,
"step": 19740
},
{
"epoch": 1.6,
"grad_norm": 5.7257380913445415,
"learning_rate": 9.779914008758064e-06,
"loss": 0.179,
"step": 19760
},
{
"epoch": 1.6,
"grad_norm": 6.438675856373519,
"learning_rate": 9.762123627390428e-06,
"loss": 0.2072,
"step": 19780
},
{
"epoch": 1.6,
"grad_norm": 7.284638627304715,
"learning_rate": 9.744334002612426e-06,
"loss": 0.1655,
"step": 19800
},
{
"epoch": 1.6,
"grad_norm": 7.624618520730721,
"learning_rate": 9.726545190768871e-06,
"loss": 0.1907,
"step": 19820
},
{
"epoch": 1.6,
"grad_norm": 5.640863218058795,
"learning_rate": 9.70875724820201e-06,
"loss": 0.1743,
"step": 19840
},
{
"epoch": 1.6,
"grad_norm": 6.508589424568692,
"learning_rate": 9.690970231251332e-06,
"loss": 0.1778,
"step": 19860
},
{
"epoch": 1.6,
"grad_norm": 4.719808699723677,
"learning_rate": 9.673184196253397e-06,
"loss": 0.1842,
"step": 19880
},
{
"epoch": 1.61,
"grad_norm": 5.327888908637108,
"learning_rate": 9.655399199541648e-06,
"loss": 0.1778,
"step": 19900
},
{
"epoch": 1.61,
"grad_norm": 4.62310607898033,
"learning_rate": 9.63761529744625e-06,
"loss": 0.159,
"step": 19920
},
{
"epoch": 1.61,
"grad_norm": 4.242984591464857,
"learning_rate": 9.61983254629389e-06,
"loss": 0.1766,
"step": 19940
},
{
"epoch": 1.61,
"grad_norm": 4.634594013494804,
"learning_rate": 9.60205100240762e-06,
"loss": 0.186,
"step": 19960
},
{
"epoch": 1.61,
"grad_norm": 6.440683707201506,
"learning_rate": 9.584270722106662e-06,
"loss": 0.1856,
"step": 19980
},
{
"epoch": 1.61,
"grad_norm": 2.790764895496361,
"learning_rate": 9.566491761706234e-06,
"loss": 0.1841,
"step": 20000
},
{
"epoch": 1.62,
"grad_norm": 7.195201164651666,
"learning_rate": 9.54871417751738e-06,
"loss": 0.1723,
"step": 20020
},
{
"epoch": 1.62,
"grad_norm": 4.764709985159199,
"learning_rate": 9.530938025846778e-06,
"loss": 0.1866,
"step": 20040
},
{
"epoch": 1.62,
"grad_norm": 5.932858550259948,
"learning_rate": 9.513163362996577e-06,
"loss": 0.1866,
"step": 20060
},
{
"epoch": 1.62,
"grad_norm": 6.464021142906132,
"learning_rate": 9.495390245264204e-06,
"loss": 0.1868,
"step": 20080
},
{
"epoch": 1.62,
"grad_norm": 5.655476722759092,
"learning_rate": 9.477618728942194e-06,
"loss": 0.166,
"step": 20100
},
{
"epoch": 1.62,
"grad_norm": 5.962336904666332,
"learning_rate": 9.459848870318007e-06,
"loss": 0.2101,
"step": 20120
},
{
"epoch": 1.63,
"grad_norm": 4.675684775725181,
"learning_rate": 9.44208072567386e-06,
"loss": 0.1772,
"step": 20140
},
{
"epoch": 1.63,
"grad_norm": 7.450883124319922,
"learning_rate": 9.42431435128654e-06,
"loss": 0.1647,
"step": 20160
},
{
"epoch": 1.63,
"grad_norm": 4.965583223967794,
"learning_rate": 9.406549803427218e-06,
"loss": 0.2103,
"step": 20180
},
{
"epoch": 1.63,
"grad_norm": 6.079517195254289,
"learning_rate": 9.388787138361289e-06,
"loss": 0.1917,
"step": 20200
},
{
"epoch": 1.63,
"grad_norm": 5.5400797739723515,
"learning_rate": 9.371026412348178e-06,
"loss": 0.1691,
"step": 20220
},
{
"epoch": 1.63,
"grad_norm": 5.974428361817412,
"learning_rate": 9.353267681641178e-06,
"loss": 0.1887,
"step": 20240
},
{
"epoch": 1.64,
"grad_norm": 5.87274126501588,
"learning_rate": 9.335511002487252e-06,
"loss": 0.1888,
"step": 20260
},
{
"epoch": 1.64,
"grad_norm": 5.03986866734462,
"learning_rate": 9.31775643112687e-06,
"loss": 0.1793,
"step": 20280
},
{
"epoch": 1.64,
"grad_norm": 4.961609321820686,
"learning_rate": 9.300004023793826e-06,
"loss": 0.1811,
"step": 20300
},
{
"epoch": 1.64,
"grad_norm": 6.09235980486571,
"learning_rate": 9.282253836715063e-06,
"loss": 0.1699,
"step": 20320
},
{
"epoch": 1.64,
"grad_norm": 6.186755202964333,
"learning_rate": 9.264505926110482e-06,
"loss": 0.1936,
"step": 20340
},
{
"epoch": 1.64,
"grad_norm": 6.795058856229219,
"learning_rate": 9.246760348192785e-06,
"loss": 0.1988,
"step": 20360
},
{
"epoch": 1.65,
"grad_norm": 8.626010116914388,
"learning_rate": 9.229017159167278e-06,
"loss": 0.1753,
"step": 20380
},
{
"epoch": 1.65,
"grad_norm": 7.036939700979724,
"learning_rate": 9.211276415231704e-06,
"loss": 0.1775,
"step": 20400
},
{
"epoch": 1.65,
"grad_norm": 7.222151578247956,
"learning_rate": 9.193538172576061e-06,
"loss": 0.2063,
"step": 20420
},
{
"epoch": 1.65,
"grad_norm": 6.261687984855021,
"learning_rate": 9.175802487382427e-06,
"loss": 0.1875,
"step": 20440
},
{
"epoch": 1.65,
"grad_norm": 6.763660595031022,
"learning_rate": 9.158069415824776e-06,
"loss": 0.162,
"step": 20460
},
{
"epoch": 1.65,
"grad_norm": 6.645406711322244,
"learning_rate": 9.140339014068805e-06,
"loss": 0.1701,
"step": 20480
},
{
"epoch": 1.65,
"grad_norm": 6.075259280852945,
"learning_rate": 9.122611338271759e-06,
"loss": 0.1876,
"step": 20500
},
{
"epoch": 1.66,
"grad_norm": 5.071793679309942,
"learning_rate": 9.104886444582239e-06,
"loss": 0.1891,
"step": 20520
},
{
"epoch": 1.66,
"grad_norm": 6.0993248112456,
"learning_rate": 9.087164389140048e-06,
"loss": 0.1773,
"step": 20540
},
{
"epoch": 1.66,
"grad_norm": 3.137955931101823,
"learning_rate": 9.069445228075984e-06,
"loss": 0.175,
"step": 20560
},
{
"epoch": 1.66,
"grad_norm": 6.87017680822122,
"learning_rate": 9.051729017511696e-06,
"loss": 0.1781,
"step": 20580
},
{
"epoch": 1.66,
"grad_norm": 6.740478016007273,
"learning_rate": 9.034015813559472e-06,
"loss": 0.1842,
"step": 20600
},
{
"epoch": 1.66,
"grad_norm": 6.293414885644483,
"learning_rate": 9.016305672322082e-06,
"loss": 0.1754,
"step": 20620
},
{
"epoch": 1.67,
"grad_norm": 6.770107414261222,
"learning_rate": 8.998598649892602e-06,
"loss": 0.1832,
"step": 20640
},
{
"epoch": 1.67,
"grad_norm": 6.957667275109727,
"learning_rate": 8.98089480235422e-06,
"loss": 0.1915,
"step": 20660
},
{
"epoch": 1.67,
"grad_norm": 7.753279699575756,
"learning_rate": 8.963194185780076e-06,
"loss": 0.2074,
"step": 20680
},
{
"epoch": 1.67,
"grad_norm": 5.572695528675712,
"learning_rate": 8.94549685623307e-06,
"loss": 0.1675,
"step": 20700
},
{
"epoch": 1.67,
"grad_norm": 8.318273389362469,
"learning_rate": 8.927802869765697e-06,
"loss": 0.1901,
"step": 20720
},
{
"epoch": 1.67,
"grad_norm": 6.1756703742616645,
"learning_rate": 8.91011228241986e-06,
"loss": 0.1759,
"step": 20740
},
{
"epoch": 1.68,
"grad_norm": 4.72678645840794,
"learning_rate": 8.892425150226697e-06,
"loss": 0.1672,
"step": 20760
},
{
"epoch": 1.68,
"grad_norm": 5.064164253114183,
"learning_rate": 8.874741529206401e-06,
"loss": 0.1832,
"step": 20780
},
{
"epoch": 1.68,
"grad_norm": 5.593557173319701,
"learning_rate": 8.857061475368046e-06,
"loss": 0.1767,
"step": 20800
},
{
"epoch": 1.68,
"grad_norm": 5.739827985678297,
"learning_rate": 8.83938504470941e-06,
"loss": 0.1633,
"step": 20820
},
{
"epoch": 1.68,
"grad_norm": 3.8745235027093172,
"learning_rate": 8.821712293216792e-06,
"loss": 0.1827,
"step": 20840
},
{
"epoch": 1.68,
"grad_norm": 5.758169590139612,
"learning_rate": 8.804043276864838e-06,
"loss": 0.1799,
"step": 20860
},
{
"epoch": 1.69,
"grad_norm": 9.048806835785985,
"learning_rate": 8.786378051616363e-06,
"loss": 0.1818,
"step": 20880
},
{
"epoch": 1.69,
"grad_norm": 4.6191342246916545,
"learning_rate": 8.768716673422176e-06,
"loss": 0.184,
"step": 20900
},
{
"epoch": 1.69,
"grad_norm": 10.64178899596623,
"learning_rate": 8.751059198220903e-06,
"loss": 0.1868,
"step": 20920
},
{
"epoch": 1.69,
"grad_norm": 5.957903048399736,
"learning_rate": 8.733405681938806e-06,
"loss": 0.2088,
"step": 20940
},
{
"epoch": 1.69,
"grad_norm": 7.14845190828466,
"learning_rate": 8.715756180489609e-06,
"loss": 0.1591,
"step": 20960
},
{
"epoch": 1.69,
"grad_norm": 5.026492884909507,
"learning_rate": 8.698110749774315e-06,
"loss": 0.1692,
"step": 20980
},
{
"epoch": 1.7,
"grad_norm": 6.193357527199346,
"learning_rate": 8.680469445681042e-06,
"loss": 0.1865,
"step": 21000
},
{
"epoch": 1.7,
"grad_norm": 5.392241393190001,
"learning_rate": 8.662832324084831e-06,
"loss": 0.1643,
"step": 21020
},
{
"epoch": 1.7,
"grad_norm": 7.209071685667427,
"learning_rate": 8.645199440847485e-06,
"loss": 0.1699,
"step": 21040
},
{
"epoch": 1.7,
"grad_norm": 6.378055513443605,
"learning_rate": 8.62757085181737e-06,
"loss": 0.1997,
"step": 21060
},
{
"epoch": 1.7,
"grad_norm": 5.948983100737581,
"learning_rate": 8.609946612829258e-06,
"loss": 0.1768,
"step": 21080
},
{
"epoch": 1.7,
"grad_norm": 5.843305438262167,
"learning_rate": 8.592326779704148e-06,
"loss": 0.1819,
"step": 21100
},
{
"epoch": 1.7,
"grad_norm": 4.695464831529554,
"learning_rate": 8.574711408249074e-06,
"loss": 0.1984,
"step": 21120
},
{
"epoch": 1.71,
"grad_norm": 5.334180148766731,
"learning_rate": 8.557100554256944e-06,
"loss": 0.18,
"step": 21140
},
{
"epoch": 1.71,
"grad_norm": 3.594041232042909,
"learning_rate": 8.53949427350636e-06,
"loss": 0.165,
"step": 21160
},
{
"epoch": 1.71,
"grad_norm": 5.690661212066448,
"learning_rate": 8.521892621761433e-06,
"loss": 0.2051,
"step": 21180
},
{
"epoch": 1.71,
"grad_norm": 8.263658257123618,
"learning_rate": 8.504295654771622e-06,
"loss": 0.178,
"step": 21200
},
{
"epoch": 1.71,
"grad_norm": 6.964730099763294,
"learning_rate": 8.486703428271536e-06,
"loss": 0.1718,
"step": 21220
},
{
"epoch": 1.71,
"grad_norm": 7.2364322382017745,
"learning_rate": 8.469115997980786e-06,
"loss": 0.1609,
"step": 21240
},
{
"epoch": 1.72,
"grad_norm": 7.01112598686971,
"learning_rate": 8.451533419603773e-06,
"loss": 0.1918,
"step": 21260
},
{
"epoch": 1.72,
"grad_norm": 6.7742526609944385,
"learning_rate": 8.433955748829543e-06,
"loss": 0.1746,
"step": 21280
},
{
"epoch": 1.72,
"grad_norm": 6.080225878083447,
"learning_rate": 8.416383041331594e-06,
"loss": 0.1621,
"step": 21300
},
{
"epoch": 1.72,
"grad_norm": 6.122578979331691,
"learning_rate": 8.398815352767706e-06,
"loss": 0.1866,
"step": 21320
},
{
"epoch": 1.72,
"grad_norm": 9.715953363199073,
"learning_rate": 8.38125273877976e-06,
"loss": 0.1696,
"step": 21340
},
{
"epoch": 1.72,
"grad_norm": 6.632818538811297,
"learning_rate": 8.363695254993569e-06,
"loss": 0.182,
"step": 21360
},
{
"epoch": 1.73,
"grad_norm": 5.18153129890793,
"learning_rate": 8.346142957018688e-06,
"loss": 0.2091,
"step": 21380
},
{
"epoch": 1.73,
"grad_norm": 4.933940908460862,
"learning_rate": 8.32859590044826e-06,
"loss": 0.1834,
"step": 21400
},
{
"epoch": 1.73,
"grad_norm": 4.130527376581761,
"learning_rate": 8.311054140858814e-06,
"loss": 0.217,
"step": 21420
},
{
"epoch": 1.73,
"grad_norm": 4.51550356163752,
"learning_rate": 8.29351773381011e-06,
"loss": 0.2001,
"step": 21440
},
{
"epoch": 1.73,
"grad_norm": 5.869039230613348,
"learning_rate": 8.275986734844956e-06,
"loss": 0.176,
"step": 21460
},
{
"epoch": 1.73,
"grad_norm": 5.039303039014279,
"learning_rate": 8.258461199489026e-06,
"loss": 0.2202,
"step": 21480
},
{
"epoch": 1.74,
"grad_norm": 6.5768380616493936,
"learning_rate": 8.240941183250689e-06,
"loss": 0.1748,
"step": 21500
},
{
"epoch": 1.74,
"grad_norm": 5.013498586372746,
"learning_rate": 8.22342674162084e-06,
"loss": 0.1933,
"step": 21520
},
{
"epoch": 1.74,
"grad_norm": 4.181027548764225,
"learning_rate": 8.205917930072707e-06,
"loss": 0.1706,
"step": 21540
},
{
"epoch": 1.74,
"grad_norm": 6.782829698366385,
"learning_rate": 8.188414804061698e-06,
"loss": 0.1857,
"step": 21560
},
{
"epoch": 1.74,
"grad_norm": 13.591366754088444,
"learning_rate": 8.170917419025203e-06,
"loss": 0.1467,
"step": 21580
},
{
"epoch": 1.74,
"grad_norm": 6.120949476303091,
"learning_rate": 8.153425830382438e-06,
"loss": 0.1991,
"step": 21600
},
{
"epoch": 1.75,
"grad_norm": 7.272557124401674,
"learning_rate": 8.135940093534249e-06,
"loss": 0.1766,
"step": 21620
},
{
"epoch": 1.75,
"grad_norm": 6.349382878150412,
"learning_rate": 8.11846026386296e-06,
"loss": 0.1989,
"step": 21640
},
{
"epoch": 1.75,
"grad_norm": 3.9867833656488356,
"learning_rate": 8.100986396732173e-06,
"loss": 0.1831,
"step": 21660
},
{
"epoch": 1.75,
"grad_norm": 5.4985809229416365,
"learning_rate": 8.083518547486617e-06,
"loss": 0.1851,
"step": 21680
},
{
"epoch": 1.75,
"grad_norm": 5.519318033191571,
"learning_rate": 8.066056771451954e-06,
"loss": 0.1879,
"step": 21700
},
{
"epoch": 1.75,
"grad_norm": 3.14973956651229,
"learning_rate": 8.048601123934609e-06,
"loss": 0.1737,
"step": 21720
},
{
"epoch": 1.75,
"grad_norm": 4.004131091247943,
"learning_rate": 8.031151660221597e-06,
"loss": 0.1667,
"step": 21740
},
{
"epoch": 1.76,
"grad_norm": 6.567536955091622,
"learning_rate": 8.013708435580352e-06,
"loss": 0.1697,
"step": 21760
},
{
"epoch": 1.76,
"grad_norm": 4.599731495525866,
"learning_rate": 7.996271505258542e-06,
"loss": 0.1547,
"step": 21780
},
{
"epoch": 1.76,
"grad_norm": 6.65599496558806,
"learning_rate": 7.978840924483904e-06,
"loss": 0.1774,
"step": 21800
},
{
"epoch": 1.76,
"grad_norm": 8.560171882828884,
"learning_rate": 7.961416748464055e-06,
"loss": 0.2049,
"step": 21820
},
{
"epoch": 1.76,
"grad_norm": 6.954024696005634,
"learning_rate": 7.943999032386336e-06,
"loss": 0.1881,
"step": 21840
},
{
"epoch": 1.76,
"grad_norm": 4.630461689811874,
"learning_rate": 7.926587831417623e-06,
"loss": 0.1881,
"step": 21860
},
{
"epoch": 1.77,
"grad_norm": 7.232546878139292,
"learning_rate": 7.90918320070416e-06,
"loss": 0.1995,
"step": 21880
},
{
"epoch": 1.77,
"grad_norm": 7.453903957263037,
"learning_rate": 7.891785195371375e-06,
"loss": 0.1722,
"step": 21900
},
{
"epoch": 1.77,
"grad_norm": 7.759776194383061,
"learning_rate": 7.874393870523715e-06,
"loss": 0.1695,
"step": 21920
},
{
"epoch": 1.77,
"grad_norm": 10.868257032537139,
"learning_rate": 7.857009281244472e-06,
"loss": 0.1835,
"step": 21940
},
{
"epoch": 1.77,
"grad_norm": 5.779984289233758,
"learning_rate": 7.839631482595597e-06,
"loss": 0.1665,
"step": 21960
},
{
"epoch": 1.77,
"grad_norm": 5.114836127824816,
"learning_rate": 7.822260529617539e-06,
"loss": 0.1882,
"step": 21980
},
{
"epoch": 1.78,
"grad_norm": 5.8070528491627105,
"learning_rate": 7.804896477329062e-06,
"loss": 0.2043,
"step": 22000
},
{
"epoch": 1.78,
"grad_norm": 3.7936436595812935,
"learning_rate": 7.787539380727074e-06,
"loss": 0.1828,
"step": 22020
},
{
"epoch": 1.78,
"grad_norm": 7.004553267660414,
"learning_rate": 7.770189294786455e-06,
"loss": 0.1891,
"step": 22040
},
{
"epoch": 1.78,
"grad_norm": 7.107618580250647,
"learning_rate": 7.752846274459873e-06,
"loss": 0.1952,
"step": 22060
},
{
"epoch": 1.78,
"grad_norm": 7.293012687334171,
"learning_rate": 7.735510374677624e-06,
"loss": 0.1668,
"step": 22080
},
{
"epoch": 1.78,
"grad_norm": 5.229665339832979,
"learning_rate": 7.718181650347453e-06,
"loss": 0.2154,
"step": 22100
},
{
"epoch": 1.79,
"grad_norm": 6.524237121183421,
"learning_rate": 7.70086015635437e-06,
"loss": 0.1834,
"step": 22120
},
{
"epoch": 1.79,
"grad_norm": 7.363125050150268,
"learning_rate": 7.683545947560491e-06,
"loss": 0.1865,
"step": 22140
},
{
"epoch": 1.79,
"grad_norm": 7.300432730612564,
"learning_rate": 7.666239078804853e-06,
"loss": 0.1818,
"step": 22160
},
{
"epoch": 1.79,
"grad_norm": 14.035100259942592,
"learning_rate": 7.648939604903252e-06,
"loss": 0.191,
"step": 22180
},
{
"epoch": 1.79,
"grad_norm": 6.910011337425548,
"learning_rate": 7.631647580648057e-06,
"loss": 0.168,
"step": 22200
},
{
"epoch": 1.79,
"grad_norm": 6.7568942995248324,
"learning_rate": 7.6143630608080395e-06,
"loss": 0.1843,
"step": 22220
},
{
"epoch": 1.8,
"grad_norm": 6.747067132204526,
"learning_rate": 7.597086100128209e-06,
"loss": 0.1937,
"step": 22240
},
{
"epoch": 1.8,
"grad_norm": 8.5527154190958,
"learning_rate": 7.579816753329629e-06,
"loss": 0.1818,
"step": 22260
},
{
"epoch": 1.8,
"grad_norm": 7.746379621946916,
"learning_rate": 7.562555075109248e-06,
"loss": 0.2052,
"step": 22280
},
{
"epoch": 1.8,
"grad_norm": 6.639988193271665,
"learning_rate": 7.545301120139724e-06,
"loss": 0.1631,
"step": 22300
},
{
"epoch": 1.8,
"grad_norm": 6.767819936156134,
"learning_rate": 7.528054943069261e-06,
"loss": 0.1661,
"step": 22320
},
{
"epoch": 1.8,
"grad_norm": 5.273647818433291,
"learning_rate": 7.510816598521416e-06,
"loss": 0.1584,
"step": 22340
},
{
"epoch": 1.8,
"grad_norm": 8.93563631273546,
"learning_rate": 7.493586141094952e-06,
"loss": 0.1555,
"step": 22360
},
{
"epoch": 1.81,
"grad_norm": 3.0753266447920566,
"learning_rate": 7.47636362536364e-06,
"loss": 0.1517,
"step": 22380
},
{
"epoch": 1.81,
"grad_norm": 10.30705792722804,
"learning_rate": 7.459149105876106e-06,
"loss": 0.154,
"step": 22400
},
{
"epoch": 1.81,
"grad_norm": 7.984708476707701,
"learning_rate": 7.441942637155638e-06,
"loss": 0.1671,
"step": 22420
},
{
"epoch": 1.81,
"grad_norm": 5.866654657582794,
"learning_rate": 7.424744273700038e-06,
"loss": 0.1886,
"step": 22440
},
{
"epoch": 1.81,
"grad_norm": 4.737602317329208,
"learning_rate": 7.407554069981428e-06,
"loss": 0.2059,
"step": 22460
},
{
"epoch": 1.81,
"grad_norm": 4.877816193781035,
"learning_rate": 7.390372080446089e-06,
"loss": 0.198,
"step": 22480
},
{
"epoch": 1.82,
"grad_norm": 4.465809741780349,
"learning_rate": 7.373198359514283e-06,
"loss": 0.1678,
"step": 22500
},
{
"epoch": 1.82,
"grad_norm": 6.934781753417284,
"learning_rate": 7.356032961580083e-06,
"loss": 0.18,
"step": 22520
},
{
"epoch": 1.82,
"grad_norm": 6.355881689873095,
"learning_rate": 7.338875941011206e-06,
"loss": 0.1676,
"step": 22540
},
{
"epoch": 1.82,
"grad_norm": 6.110876810996104,
"learning_rate": 7.321727352148833e-06,
"loss": 0.1855,
"step": 22560
},
{
"epoch": 1.82,
"grad_norm": 5.355037257245673,
"learning_rate": 7.304587249307434e-06,
"loss": 0.1804,
"step": 22580
},
{
"epoch": 1.82,
"grad_norm": 6.6548543550416195,
"learning_rate": 7.287455686774608e-06,
"loss": 0.2034,
"step": 22600
},
{
"epoch": 1.83,
"grad_norm": 9.27569310973663,
"learning_rate": 7.270332718810901e-06,
"loss": 0.1937,
"step": 22620
},
{
"epoch": 1.83,
"grad_norm": 4.751357838150305,
"learning_rate": 7.253218399649638e-06,
"loss": 0.1651,
"step": 22640
},
{
"epoch": 1.83,
"grad_norm": 9.734100288935727,
"learning_rate": 7.2361127834967505e-06,
"loss": 0.1529,
"step": 22660
},
{
"epoch": 1.83,
"grad_norm": 8.120082834507159,
"learning_rate": 7.219015924530608e-06,
"loss": 0.1747,
"step": 22680
},
{
"epoch": 1.83,
"grad_norm": 5.407215715714017,
"learning_rate": 7.201927876901839e-06,
"loss": 0.1704,
"step": 22700
},
{
"epoch": 1.83,
"grad_norm": 5.774551387388109,
"learning_rate": 7.184848694733164e-06,
"loss": 0.161,
"step": 22720
},
{
"epoch": 1.84,
"grad_norm": 6.759114928231145,
"learning_rate": 7.167778432119233e-06,
"loss": 0.1879,
"step": 22740
},
{
"epoch": 1.84,
"grad_norm": 4.92199338740308,
"learning_rate": 7.150717143126433e-06,
"loss": 0.1652,
"step": 22760
},
{
"epoch": 1.84,
"grad_norm": 4.178823107172061,
"learning_rate": 7.133664881792739e-06,
"loss": 0.1785,
"step": 22780
},
{
"epoch": 1.84,
"grad_norm": 6.686026437573051,
"learning_rate": 7.116621702127524e-06,
"loss": 0.1869,
"step": 22800
},
{
"epoch": 1.84,
"grad_norm": 4.2403070205517,
"learning_rate": 7.099587658111403e-06,
"loss": 0.1673,
"step": 22820
},
{
"epoch": 1.84,
"grad_norm": 3.965564681031053,
"learning_rate": 7.082562803696054e-06,
"loss": 0.1606,
"step": 22840
},
{
"epoch": 1.85,
"grad_norm": 7.1312004014078205,
"learning_rate": 7.065547192804044e-06,
"loss": 0.1833,
"step": 22860
},
{
"epoch": 1.85,
"grad_norm": 6.361412078237849,
"learning_rate": 7.048540879328677e-06,
"loss": 0.176,
"step": 22880
},
{
"epoch": 1.85,
"grad_norm": 7.594302059247436,
"learning_rate": 7.031543917133794e-06,
"loss": 0.1622,
"step": 22900
},
{
"epoch": 1.85,
"grad_norm": 10.096625612296556,
"learning_rate": 7.014556360053627e-06,
"loss": 0.1875,
"step": 22920
},
{
"epoch": 1.85,
"grad_norm": 5.429860679005328,
"learning_rate": 6.997578261892612e-06,
"loss": 0.1742,
"step": 22940
},
{
"epoch": 1.85,
"grad_norm": 4.64963548113418,
"learning_rate": 6.980609676425238e-06,
"loss": 0.1645,
"step": 22960
},
{
"epoch": 1.85,
"grad_norm": 7.328249932303926,
"learning_rate": 6.963650657395851e-06,
"loss": 0.1653,
"step": 22980
},
{
"epoch": 1.86,
"grad_norm": 4.797310178817107,
"learning_rate": 6.946701258518505e-06,
"loss": 0.1718,
"step": 23000
},
{
"epoch": 1.86,
"grad_norm": 5.93397915500725,
"learning_rate": 6.929761533476782e-06,
"loss": 0.171,
"step": 23020
},
{
"epoch": 1.86,
"grad_norm": 4.493879394366119,
"learning_rate": 6.912831535923627e-06,
"loss": 0.1596,
"step": 23040
},
{
"epoch": 1.86,
"grad_norm": 4.5526438818226564,
"learning_rate": 6.89591131948117e-06,
"loss": 0.1477,
"step": 23060
},
{
"epoch": 1.86,
"grad_norm": 5.57770560261374,
"learning_rate": 6.879000937740566e-06,
"loss": 0.1911,
"step": 23080
},
{
"epoch": 1.86,
"grad_norm": 5.825009574391105,
"learning_rate": 6.862100444261819e-06,
"loss": 0.1768,
"step": 23100
},
{
"epoch": 1.87,
"grad_norm": 5.136029725948361,
"learning_rate": 6.845209892573611e-06,
"loss": 0.1863,
"step": 23120
},
{
"epoch": 1.87,
"grad_norm": 7.989879882826286,
"learning_rate": 6.828329336173145e-06,
"loss": 0.1763,
"step": 23140
},
{
"epoch": 1.87,
"grad_norm": 6.65685803139121,
"learning_rate": 6.8114588285259576e-06,
"loss": 0.1755,
"step": 23160
},
{
"epoch": 1.87,
"grad_norm": 9.26693783187225,
"learning_rate": 6.794598423065758e-06,
"loss": 0.176,
"step": 23180
},
{
"epoch": 1.87,
"grad_norm": 3.574898543618821,
"learning_rate": 6.7777481731942616e-06,
"loss": 0.1858,
"step": 23200
},
{
"epoch": 1.87,
"grad_norm": 4.831086963108529,
"learning_rate": 6.760908132281021e-06,
"loss": 0.1796,
"step": 23220
},
{
"epoch": 1.88,
"grad_norm": 7.68773272402437,
"learning_rate": 6.744078353663247e-06,
"loss": 0.1703,
"step": 23240
},
{
"epoch": 1.88,
"grad_norm": 9.370545816880146,
"learning_rate": 6.727258890645652e-06,
"loss": 0.18,
"step": 23260
},
{
"epoch": 1.88,
"grad_norm": 9.159176720015646,
"learning_rate": 6.710449796500274e-06,
"loss": 0.1716,
"step": 23280
},
{
"epoch": 1.88,
"grad_norm": 5.362130304855524,
"learning_rate": 6.693651124466311e-06,
"loss": 0.168,
"step": 23300
},
{
"epoch": 1.88,
"grad_norm": 4.870253771176025,
"learning_rate": 6.676862927749953e-06,
"loss": 0.2008,
"step": 23320
},
{
"epoch": 1.88,
"grad_norm": 4.029937293840335,
"learning_rate": 6.6600852595242075e-06,
"loss": 0.1735,
"step": 23340
},
{
"epoch": 1.89,
"grad_norm": 5.6204264379241025,
"learning_rate": 6.643318172928737e-06,
"loss": 0.1707,
"step": 23360
},
{
"epoch": 1.89,
"grad_norm": 9.703306236355932,
"learning_rate": 6.626561721069688e-06,
"loss": 0.1599,
"step": 23380
},
{
"epoch": 1.89,
"grad_norm": 7.347162760458088,
"learning_rate": 6.609815957019527e-06,
"loss": 0.1703,
"step": 23400
},
{
"epoch": 1.89,
"grad_norm": 12.113550817300446,
"learning_rate": 6.593080933816866e-06,
"loss": 0.1784,
"step": 23420
},
{
"epoch": 1.89,
"grad_norm": 3.3783362111733486,
"learning_rate": 6.576356704466297e-06,
"loss": 0.1641,
"step": 23440
},
{
"epoch": 1.89,
"grad_norm": 4.068637613130167,
"learning_rate": 6.5596433219382285e-06,
"loss": 0.1436,
"step": 23460
},
{
"epoch": 1.9,
"grad_norm": 6.015225800979343,
"learning_rate": 6.542940839168712e-06,
"loss": 0.1975,
"step": 23480
},
{
"epoch": 1.9,
"grad_norm": 7.2342521376918585,
"learning_rate": 6.5262493090592715e-06,
"loss": 0.1882,
"step": 23500
},
{
"epoch": 1.9,
"grad_norm": 7.619431760731591,
"learning_rate": 6.509568784476753e-06,
"loss": 0.1743,
"step": 23520
},
{
"epoch": 1.9,
"grad_norm": 4.213072170259656,
"learning_rate": 6.4928993182531345e-06,
"loss": 0.1576,
"step": 23540
},
{
"epoch": 1.9,
"grad_norm": 8.226101564425411,
"learning_rate": 6.476240963185369e-06,
"loss": 0.1565,
"step": 23560
},
{
"epoch": 1.9,
"grad_norm": 8.224634189065512,
"learning_rate": 6.459593772035225e-06,
"loss": 0.1835,
"step": 23580
},
{
"epoch": 1.91,
"grad_norm": 6.247185035023087,
"learning_rate": 6.442957797529104e-06,
"loss": 0.1736,
"step": 23600
},
{
"epoch": 1.91,
"grad_norm": 5.9810785123502965,
"learning_rate": 6.426333092357886e-06,
"loss": 0.1615,
"step": 23620
},
{
"epoch": 1.91,
"grad_norm": 11.84481798656128,
"learning_rate": 6.409719709176755e-06,
"loss": 0.1888,
"step": 23640
},
{
"epoch": 1.91,
"grad_norm": 7.506850102533956,
"learning_rate": 6.393117700605034e-06,
"loss": 0.1963,
"step": 23660
},
{
"epoch": 1.91,
"grad_norm": 7.776460084859693,
"learning_rate": 6.376527119226023e-06,
"loss": 0.1485,
"step": 23680
},
{
"epoch": 1.91,
"grad_norm": 6.3180091914164125,
"learning_rate": 6.359948017586827e-06,
"loss": 0.1816,
"step": 23700
},
{
"epoch": 1.91,
"grad_norm": 11.20267927275362,
"learning_rate": 6.343380448198188e-06,
"loss": 0.1652,
"step": 23720
},
{
"epoch": 1.92,
"grad_norm": 6.037469216973155,
"learning_rate": 6.326824463534336e-06,
"loss": 0.1725,
"step": 23740
},
{
"epoch": 1.92,
"grad_norm": 6.261846682645075,
"learning_rate": 6.310280116032791e-06,
"loss": 0.1538,
"step": 23760
},
{
"epoch": 1.92,
"grad_norm": 8.59123942376336,
"learning_rate": 6.293747458094223e-06,
"loss": 0.1737,
"step": 23780
},
{
"epoch": 1.92,
"grad_norm": 10.500292092756426,
"learning_rate": 6.277226542082278e-06,
"loss": 0.1921,
"step": 23800
},
{
"epoch": 1.92,
"grad_norm": 4.477032435385099,
"learning_rate": 6.260717420323409e-06,
"loss": 0.1721,
"step": 23820
},
{
"epoch": 1.92,
"grad_norm": 5.503172566318245,
"learning_rate": 6.244220145106716e-06,
"loss": 0.1668,
"step": 23840
},
{
"epoch": 1.93,
"grad_norm": 13.937620974986471,
"learning_rate": 6.227734768683779e-06,
"loss": 0.1721,
"step": 23860
},
{
"epoch": 1.93,
"grad_norm": 4.6367027070331135,
"learning_rate": 6.211261343268485e-06,
"loss": 0.1765,
"step": 23880
},
{
"epoch": 1.93,
"grad_norm": 5.804661212928365,
"learning_rate": 6.194799921036879e-06,
"loss": 0.1706,
"step": 23900
},
{
"epoch": 1.93,
"grad_norm": 5.838975230670599,
"learning_rate": 6.178350554126979e-06,
"loss": 0.1684,
"step": 23920
},
{
"epoch": 1.93,
"grad_norm": 6.638228830097108,
"learning_rate": 6.161913294638621e-06,
"loss": 0.1848,
"step": 23940
},
{
"epoch": 1.93,
"grad_norm": 5.656968213422785,
"learning_rate": 6.1454881946333e-06,
"loss": 0.1674,
"step": 23960
},
{
"epoch": 1.94,
"grad_norm": 10.46109722035461,
"learning_rate": 6.1290753061339925e-06,
"loss": 0.1631,
"step": 23980
},
{
"epoch": 1.94,
"grad_norm": 7.0280095071831425,
"learning_rate": 6.112674681124998e-06,
"loss": 0.1759,
"step": 24000
},
{
"epoch": 1.94,
"grad_norm": 7.401073079016803,
"learning_rate": 6.09628637155178e-06,
"loss": 0.185,
"step": 24020
},
{
"epoch": 1.94,
"grad_norm": 7.45583155210073,
"learning_rate": 6.079910429320789e-06,
"loss": 0.1907,
"step": 24040
},
{
"epoch": 1.94,
"grad_norm": 5.361896066514213,
"learning_rate": 6.063546906299304e-06,
"loss": 0.1661,
"step": 24060
},
{
"epoch": 1.94,
"grad_norm": 4.390297656906118,
"learning_rate": 6.047195854315274e-06,
"loss": 0.161,
"step": 24080
},
{
"epoch": 1.95,
"grad_norm": 7.3569152997891365,
"learning_rate": 6.030857325157148e-06,
"loss": 0.183,
"step": 24100
},
{
"epoch": 1.95,
"grad_norm": 9.276295993466274,
"learning_rate": 6.014531370573706e-06,
"loss": 0.1585,
"step": 24120
},
{
"epoch": 1.95,
"grad_norm": 6.708581228789846,
"learning_rate": 5.99821804227391e-06,
"loss": 0.1923,
"step": 24140
},
{
"epoch": 1.95,
"grad_norm": 4.565214003093718,
"learning_rate": 5.981917391926716e-06,
"loss": 0.1618,
"step": 24160
},
{
"epoch": 1.95,
"grad_norm": 4.958953220289099,
"learning_rate": 5.9656294711609455e-06,
"loss": 0.1766,
"step": 24180
},
{
"epoch": 1.95,
"grad_norm": 8.765162382650892,
"learning_rate": 5.949354331565087e-06,
"loss": 0.179,
"step": 24200
},
{
"epoch": 1.96,
"grad_norm": 9.338864277112508,
"learning_rate": 5.93309202468715e-06,
"loss": 0.1772,
"step": 24220
},
{
"epoch": 1.96,
"grad_norm": 6.455990799227259,
"learning_rate": 5.916842602034503e-06,
"loss": 0.1764,
"step": 24240
},
{
"epoch": 1.96,
"grad_norm": 5.02536557514161,
"learning_rate": 5.900606115073703e-06,
"loss": 0.1834,
"step": 24260
},
{
"epoch": 1.96,
"grad_norm": 5.6042058818064335,
"learning_rate": 5.884382615230334e-06,
"loss": 0.1667,
"step": 24280
},
{
"epoch": 1.96,
"grad_norm": 3.5189123085762195,
"learning_rate": 5.8681721538888544e-06,
"loss": 0.1572,
"step": 24300
},
{
"epoch": 1.96,
"grad_norm": 5.6992692847099855,
"learning_rate": 5.85197478239242e-06,
"loss": 0.1953,
"step": 24320
},
{
"epoch": 1.96,
"grad_norm": 7.984601221033869,
"learning_rate": 5.835790552042726e-06,
"loss": 0.1821,
"step": 24340
},
{
"epoch": 1.97,
"grad_norm": 5.326599797739027,
"learning_rate": 5.819619514099847e-06,
"loss": 0.1899,
"step": 24360
},
{
"epoch": 1.97,
"grad_norm": 5.785676158799588,
"learning_rate": 5.80346171978208e-06,
"loss": 0.1655,
"step": 24380
},
{
"epoch": 1.97,
"grad_norm": 5.66486255125483,
"learning_rate": 5.78731722026576e-06,
"loss": 0.1787,
"step": 24400
},
{
"epoch": 1.97,
"grad_norm": 3.0399399302219625,
"learning_rate": 5.771186066685136e-06,
"loss": 0.1913,
"step": 24420
},
{
"epoch": 1.97,
"grad_norm": 4.538207717223615,
"learning_rate": 5.755068310132162e-06,
"loss": 0.1486,
"step": 24440
},
{
"epoch": 1.97,
"grad_norm": 8.673712678315818,
"learning_rate": 5.738964001656382e-06,
"loss": 0.1561,
"step": 24460
},
{
"epoch": 1.98,
"grad_norm": 3.9322192187509035,
"learning_rate": 5.722873192264731e-06,
"loss": 0.1594,
"step": 24480
},
{
"epoch": 1.98,
"grad_norm": 5.362419487053112,
"learning_rate": 5.706795932921395e-06,
"loss": 0.1769,
"step": 24500
},
{
"epoch": 1.98,
"grad_norm": 5.243766171745307,
"learning_rate": 5.690732274547639e-06,
"loss": 0.1674,
"step": 24520
},
{
"epoch": 1.98,
"grad_norm": 6.49132016995219,
"learning_rate": 5.674682268021655e-06,
"loss": 0.1795,
"step": 24540
},
{
"epoch": 1.98,
"grad_norm": 8.23819975362725,
"learning_rate": 5.658645964178398e-06,
"loss": 0.1739,
"step": 24560
},
{
"epoch": 1.98,
"grad_norm": 6.291038363918079,
"learning_rate": 5.642623413809408e-06,
"loss": 0.1574,
"step": 24580
},
{
"epoch": 1.99,
"grad_norm": 5.366637178107015,
"learning_rate": 5.626614667662681e-06,
"loss": 0.1694,
"step": 24600
},
{
"epoch": 1.99,
"grad_norm": 9.263601664358115,
"learning_rate": 5.610619776442482e-06,
"loss": 0.1928,
"step": 24620
},
{
"epoch": 1.99,
"grad_norm": 6.177003530809166,
"learning_rate": 5.5946387908091995e-06,
"loss": 0.1578,
"step": 24640
},
{
"epoch": 1.99,
"grad_norm": 4.122671452664669,
"learning_rate": 5.5786717613791675e-06,
"loss": 0.1652,
"step": 24660
},
{
"epoch": 1.99,
"grad_norm": 8.598868326647608,
"learning_rate": 5.562718738724532e-06,
"loss": 0.1829,
"step": 24680
},
{
"epoch": 1.99,
"grad_norm": 5.201437186659346,
"learning_rate": 5.54677977337306e-06,
"loss": 0.1948,
"step": 24700
},
{
"epoch": 2.0,
"grad_norm": 5.546180531185139,
"learning_rate": 5.530854915808009e-06,
"loss": 0.1632,
"step": 24720
},
{
"epoch": 2.0,
"grad_norm": 5.599419815798738,
"learning_rate": 5.514944216467942e-06,
"loss": 0.173,
"step": 24740
},
{
"epoch": 2.0,
"grad_norm": 6.685290390475231,
"learning_rate": 5.4990477257465854e-06,
"loss": 0.1767,
"step": 24760
},
{
"epoch": 2.0,
"grad_norm": 8.819487702938599,
"learning_rate": 5.483165493992667e-06,
"loss": 0.1491,
"step": 24780
},
{
"epoch": 2.0,
"grad_norm": 3.9679892704495603,
"learning_rate": 5.467297571509735e-06,
"loss": 0.1422,
"step": 24800
},
{
"epoch": 2.0,
"grad_norm": 6.793517658152576,
"learning_rate": 5.451444008556042e-06,
"loss": 0.1183,
"step": 24820
},
{
"epoch": 2.01,
"grad_norm": 5.961884869528241,
"learning_rate": 5.435604855344332e-06,
"loss": 0.1284,
"step": 24840
},
{
"epoch": 2.01,
"grad_norm": 7.746334808953981,
"learning_rate": 5.419780162041731e-06,
"loss": 0.1081,
"step": 24860
},
{
"epoch": 2.01,
"grad_norm": 4.4474985013962485,
"learning_rate": 5.4039699787695536e-06,
"loss": 0.1347,
"step": 24880
},
{
"epoch": 2.01,
"grad_norm": 5.856732802707876,
"learning_rate": 5.388174355603166e-06,
"loss": 0.1545,
"step": 24900
},
{
"epoch": 2.01,
"grad_norm": 4.924020644664483,
"learning_rate": 5.372393342571808e-06,
"loss": 0.1499,
"step": 24920
},
{
"epoch": 2.01,
"grad_norm": 5.682977240510656,
"learning_rate": 5.356626989658453e-06,
"loss": 0.1246,
"step": 24940
},
{
"epoch": 2.01,
"grad_norm": 5.859264972925292,
"learning_rate": 5.340875346799646e-06,
"loss": 0.1305,
"step": 24960
},
{
"epoch": 2.02,
"grad_norm": 4.623581539899941,
"learning_rate": 5.325138463885324e-06,
"loss": 0.1264,
"step": 24980
},
{
"epoch": 2.02,
"grad_norm": 7.01967036937822,
"learning_rate": 5.309416390758695e-06,
"loss": 0.1069,
"step": 25000
}
],
"logging_steps": 20,
"max_steps": 37164,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}