{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5772,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005197505197505198,
"grad_norm": 37.059080589232245,
"learning_rate": 3.4602076124567476e-08,
"loss": 2.0466,
"step": 1
},
{
"epoch": 0.002598752598752599,
"grad_norm": 37.255614130391415,
"learning_rate": 1.730103806228374e-07,
"loss": 2.0444,
"step": 5
},
{
"epoch": 0.005197505197505198,
"grad_norm": 38.706692454199526,
"learning_rate": 3.460207612456748e-07,
"loss": 2.0231,
"step": 10
},
{
"epoch": 0.007796257796257797,
"grad_norm": 38.51739451223501,
"learning_rate": 5.190311418685121e-07,
"loss": 1.9458,
"step": 15
},
{
"epoch": 0.010395010395010396,
"grad_norm": 10.810954056905189,
"learning_rate": 6.920415224913496e-07,
"loss": 1.842,
"step": 20
},
{
"epoch": 0.012993762993762994,
"grad_norm": 5.592591424491592,
"learning_rate": 8.650519031141868e-07,
"loss": 1.736,
"step": 25
},
{
"epoch": 0.015592515592515593,
"grad_norm": 4.235861784422552,
"learning_rate": 1.0380622837370243e-06,
"loss": 1.6427,
"step": 30
},
{
"epoch": 0.018191268191268192,
"grad_norm": 3.005957983709911,
"learning_rate": 1.2110726643598616e-06,
"loss": 1.4661,
"step": 35
},
{
"epoch": 0.02079002079002079,
"grad_norm": 2.1399033031159416,
"learning_rate": 1.3840830449826992e-06,
"loss": 1.3665,
"step": 40
},
{
"epoch": 0.02338877338877339,
"grad_norm": 2.311859726684216,
"learning_rate": 1.5570934256055365e-06,
"loss": 1.2455,
"step": 45
},
{
"epoch": 0.02598752598752599,
"grad_norm": 2.2333969372651588,
"learning_rate": 1.7301038062283736e-06,
"loss": 1.1655,
"step": 50
},
{
"epoch": 0.028586278586278588,
"grad_norm": 1.4673684609803286,
"learning_rate": 1.9031141868512112e-06,
"loss": 1.0897,
"step": 55
},
{
"epoch": 0.031185031185031187,
"grad_norm": 1.219673178403078,
"learning_rate": 2.0761245674740485e-06,
"loss": 1.0397,
"step": 60
},
{
"epoch": 0.033783783783783786,
"grad_norm": 1.177265012600064,
"learning_rate": 2.249134948096886e-06,
"loss": 0.9918,
"step": 65
},
{
"epoch": 0.036382536382536385,
"grad_norm": 1.1401627512076926,
"learning_rate": 2.4221453287197232e-06,
"loss": 0.9519,
"step": 70
},
{
"epoch": 0.03898128898128898,
"grad_norm": 1.104171712944849,
"learning_rate": 2.5951557093425604e-06,
"loss": 0.9419,
"step": 75
},
{
"epoch": 0.04158004158004158,
"grad_norm": 1.153098727216341,
"learning_rate": 2.7681660899653983e-06,
"loss": 0.9108,
"step": 80
},
{
"epoch": 0.04417879417879418,
"grad_norm": 1.1902270145957274,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.8827,
"step": 85
},
{
"epoch": 0.04677754677754678,
"grad_norm": 1.246541694659634,
"learning_rate": 3.114186851211073e-06,
"loss": 0.8805,
"step": 90
},
{
"epoch": 0.04937629937629938,
"grad_norm": 1.3692651531313507,
"learning_rate": 3.28719723183391e-06,
"loss": 0.872,
"step": 95
},
{
"epoch": 0.05197505197505198,
"grad_norm": 1.2906179501144006,
"learning_rate": 3.4602076124567473e-06,
"loss": 0.8603,
"step": 100
},
{
"epoch": 0.05457380457380458,
"grad_norm": 1.1975871593644642,
"learning_rate": 3.6332179930795853e-06,
"loss": 0.8401,
"step": 105
},
{
"epoch": 0.057172557172557176,
"grad_norm": 1.0702701275027335,
"learning_rate": 3.8062283737024224e-06,
"loss": 0.8599,
"step": 110
},
{
"epoch": 0.059771309771309775,
"grad_norm": 1.2062361616415083,
"learning_rate": 3.9792387543252595e-06,
"loss": 0.8437,
"step": 115
},
{
"epoch": 0.062370062370062374,
"grad_norm": 1.0287364509894605,
"learning_rate": 4.152249134948097e-06,
"loss": 0.822,
"step": 120
},
{
"epoch": 0.06496881496881497,
"grad_norm": 1.3847381271884296,
"learning_rate": 4.325259515570935e-06,
"loss": 0.8193,
"step": 125
},
{
"epoch": 0.06756756756756757,
"grad_norm": 1.2610205489076338,
"learning_rate": 4.498269896193772e-06,
"loss": 0.8202,
"step": 130
},
{
"epoch": 0.07016632016632017,
"grad_norm": 1.2296632878962366,
"learning_rate": 4.67128027681661e-06,
"loss": 0.8231,
"step": 135
},
{
"epoch": 0.07276507276507277,
"grad_norm": 1.2968286734442396,
"learning_rate": 4.8442906574394464e-06,
"loss": 0.8196,
"step": 140
},
{
"epoch": 0.07536382536382537,
"grad_norm": 1.1170405804017387,
"learning_rate": 5.017301038062284e-06,
"loss": 0.8174,
"step": 145
},
{
"epoch": 0.07796257796257797,
"grad_norm": 1.0601393778580994,
"learning_rate": 5.190311418685121e-06,
"loss": 0.8095,
"step": 150
},
{
"epoch": 0.08056133056133057,
"grad_norm": 1.0897540737796731,
"learning_rate": 5.363321799307959e-06,
"loss": 0.7995,
"step": 155
},
{
"epoch": 0.08316008316008316,
"grad_norm": 1.2121810987520705,
"learning_rate": 5.536332179930797e-06,
"loss": 0.8207,
"step": 160
},
{
"epoch": 0.08575883575883576,
"grad_norm": 1.1378189701749455,
"learning_rate": 5.709342560553633e-06,
"loss": 0.7946,
"step": 165
},
{
"epoch": 0.08835758835758836,
"grad_norm": 1.0503529917982035,
"learning_rate": 5.882352941176471e-06,
"loss": 0.8116,
"step": 170
},
{
"epoch": 0.09095634095634096,
"grad_norm": 1.128045536223591,
"learning_rate": 6.055363321799308e-06,
"loss": 0.7943,
"step": 175
},
{
"epoch": 0.09355509355509356,
"grad_norm": 1.0062924807045572,
"learning_rate": 6.228373702422146e-06,
"loss": 0.7908,
"step": 180
},
{
"epoch": 0.09615384615384616,
"grad_norm": 1.0645389567201315,
"learning_rate": 6.401384083044984e-06,
"loss": 0.7961,
"step": 185
},
{
"epoch": 0.09875259875259876,
"grad_norm": 1.1414748404258819,
"learning_rate": 6.57439446366782e-06,
"loss": 0.7847,
"step": 190
},
{
"epoch": 0.10135135135135136,
"grad_norm": 1.205384007751443,
"learning_rate": 6.747404844290658e-06,
"loss": 0.7751,
"step": 195
},
{
"epoch": 0.10395010395010396,
"grad_norm": 1.1367398433720104,
"learning_rate": 6.9204152249134946e-06,
"loss": 0.7919,
"step": 200
},
{
"epoch": 0.10654885654885655,
"grad_norm": 0.9307012296511041,
"learning_rate": 7.093425605536333e-06,
"loss": 0.7901,
"step": 205
},
{
"epoch": 0.10914760914760915,
"grad_norm": 1.0367934940766987,
"learning_rate": 7.2664359861591705e-06,
"loss": 0.7895,
"step": 210
},
{
"epoch": 0.11174636174636175,
"grad_norm": 0.96847574603506,
"learning_rate": 7.439446366782007e-06,
"loss": 0.7883,
"step": 215
},
{
"epoch": 0.11434511434511435,
"grad_norm": 1.1618158896817028,
"learning_rate": 7.612456747404845e-06,
"loss": 0.7849,
"step": 220
},
{
"epoch": 0.11694386694386695,
"grad_norm": 2.5717464884960584,
"learning_rate": 7.785467128027683e-06,
"loss": 0.7826,
"step": 225
},
{
"epoch": 0.11954261954261955,
"grad_norm": 1.0453668836748238,
"learning_rate": 7.958477508650519e-06,
"loss": 0.7682,
"step": 230
},
{
"epoch": 0.12214137214137215,
"grad_norm": 1.0156117659063706,
"learning_rate": 8.131487889273357e-06,
"loss": 0.7622,
"step": 235
},
{
"epoch": 0.12474012474012475,
"grad_norm": 0.9498681526378566,
"learning_rate": 8.304498269896194e-06,
"loss": 0.7861,
"step": 240
},
{
"epoch": 0.12733887733887733,
"grad_norm": 1.0843119677144408,
"learning_rate": 8.477508650519032e-06,
"loss": 0.7804,
"step": 245
},
{
"epoch": 0.12993762993762994,
"grad_norm": 1.1528568447769787,
"learning_rate": 8.65051903114187e-06,
"loss": 0.7639,
"step": 250
},
{
"epoch": 0.13253638253638253,
"grad_norm": 1.1295719141693836,
"learning_rate": 8.823529411764707e-06,
"loss": 0.7816,
"step": 255
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.978783892464181,
"learning_rate": 8.996539792387544e-06,
"loss": 0.7672,
"step": 260
},
{
"epoch": 0.13773388773388773,
"grad_norm": 0.9379534418690467,
"learning_rate": 9.16955017301038e-06,
"loss": 0.7702,
"step": 265
},
{
"epoch": 0.14033264033264034,
"grad_norm": 1.1416793594082861,
"learning_rate": 9.34256055363322e-06,
"loss": 0.7738,
"step": 270
},
{
"epoch": 0.14293139293139293,
"grad_norm": 0.9977084295945086,
"learning_rate": 9.515570934256057e-06,
"loss": 0.7696,
"step": 275
},
{
"epoch": 0.14553014553014554,
"grad_norm": 1.037149328356884,
"learning_rate": 9.688581314878893e-06,
"loss": 0.7674,
"step": 280
},
{
"epoch": 0.14812889812889812,
"grad_norm": 1.0165544577935077,
"learning_rate": 9.86159169550173e-06,
"loss": 0.7554,
"step": 285
},
{
"epoch": 0.15072765072765074,
"grad_norm": 0.9713706199068332,
"learning_rate": 1.0034602076124568e-05,
"loss": 0.7771,
"step": 290
},
{
"epoch": 0.15332640332640332,
"grad_norm": 1.0519199834853972,
"learning_rate": 1.0207612456747407e-05,
"loss": 0.7652,
"step": 295
},
{
"epoch": 0.15592515592515593,
"grad_norm": 0.8942757233736588,
"learning_rate": 1.0380622837370241e-05,
"loss": 0.754,
"step": 300
},
{
"epoch": 0.15852390852390852,
"grad_norm": 0.9760267256597028,
"learning_rate": 1.055363321799308e-05,
"loss": 0.7597,
"step": 305
},
{
"epoch": 0.16112266112266113,
"grad_norm": 0.8750773264970739,
"learning_rate": 1.0726643598615918e-05,
"loss": 0.7552,
"step": 310
},
{
"epoch": 0.16372141372141372,
"grad_norm": 0.9317612467807546,
"learning_rate": 1.0899653979238756e-05,
"loss": 0.7619,
"step": 315
},
{
"epoch": 0.16632016632016633,
"grad_norm": 0.9585051662580469,
"learning_rate": 1.1072664359861593e-05,
"loss": 0.7562,
"step": 320
},
{
"epoch": 0.16891891891891891,
"grad_norm": 0.9833066117827967,
"learning_rate": 1.124567474048443e-05,
"loss": 0.7668,
"step": 325
},
{
"epoch": 0.17151767151767153,
"grad_norm": 0.9999136205277245,
"learning_rate": 1.1418685121107267e-05,
"loss": 0.7563,
"step": 330
},
{
"epoch": 0.1741164241164241,
"grad_norm": 1.063190427210389,
"learning_rate": 1.1591695501730104e-05,
"loss": 0.7463,
"step": 335
},
{
"epoch": 0.17671517671517672,
"grad_norm": 0.9998506539481437,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.7533,
"step": 340
},
{
"epoch": 0.1793139293139293,
"grad_norm": 0.929168293634566,
"learning_rate": 1.1937716262975781e-05,
"loss": 0.7466,
"step": 345
},
{
"epoch": 0.18191268191268192,
"grad_norm": 1.018693960607738,
"learning_rate": 1.2110726643598615e-05,
"loss": 0.7489,
"step": 350
},
{
"epoch": 0.1845114345114345,
"grad_norm": 0.9657161121572101,
"learning_rate": 1.2283737024221455e-05,
"loss": 0.7639,
"step": 355
},
{
"epoch": 0.18711018711018712,
"grad_norm": 0.9274247630285816,
"learning_rate": 1.2456747404844292e-05,
"loss": 0.7458,
"step": 360
},
{
"epoch": 0.1897089397089397,
"grad_norm": 0.8709049483455183,
"learning_rate": 1.262975778546713e-05,
"loss": 0.7542,
"step": 365
},
{
"epoch": 0.19230769230769232,
"grad_norm": 0.8529475456705145,
"learning_rate": 1.2802768166089967e-05,
"loss": 0.7615,
"step": 370
},
{
"epoch": 0.1949064449064449,
"grad_norm": 0.8834877993689659,
"learning_rate": 1.2975778546712803e-05,
"loss": 0.7555,
"step": 375
},
{
"epoch": 0.19750519750519752,
"grad_norm": 0.8612036241498346,
"learning_rate": 1.314878892733564e-05,
"loss": 0.7455,
"step": 380
},
{
"epoch": 0.2001039501039501,
"grad_norm": 0.972005034702574,
"learning_rate": 1.3321799307958478e-05,
"loss": 0.7335,
"step": 385
},
{
"epoch": 0.20270270270270271,
"grad_norm": 0.8405468852505008,
"learning_rate": 1.3494809688581316e-05,
"loss": 0.7454,
"step": 390
},
{
"epoch": 0.2053014553014553,
"grad_norm": 0.94483984497754,
"learning_rate": 1.3667820069204153e-05,
"loss": 0.7509,
"step": 395
},
{
"epoch": 0.2079002079002079,
"grad_norm": 0.8568496193733218,
"learning_rate": 1.3840830449826989e-05,
"loss": 0.7386,
"step": 400
},
{
"epoch": 0.2104989604989605,
"grad_norm": 0.9305490201344858,
"learning_rate": 1.4013840830449827e-05,
"loss": 0.7325,
"step": 405
},
{
"epoch": 0.2130977130977131,
"grad_norm": 0.8391743588987977,
"learning_rate": 1.4186851211072666e-05,
"loss": 0.7394,
"step": 410
},
{
"epoch": 0.2156964656964657,
"grad_norm": 0.8904148072363134,
"learning_rate": 1.4359861591695503e-05,
"loss": 0.7659,
"step": 415
},
{
"epoch": 0.2182952182952183,
"grad_norm": 0.9494764558208273,
"learning_rate": 1.4532871972318341e-05,
"loss": 0.7303,
"step": 420
},
{
"epoch": 0.2208939708939709,
"grad_norm": 0.8729324500601073,
"learning_rate": 1.4705882352941179e-05,
"loss": 0.7464,
"step": 425
},
{
"epoch": 0.2234927234927235,
"grad_norm": 0.9426426724996545,
"learning_rate": 1.4878892733564014e-05,
"loss": 0.7425,
"step": 430
},
{
"epoch": 0.2260914760914761,
"grad_norm": 0.8397393999023687,
"learning_rate": 1.5051903114186852e-05,
"loss": 0.7225,
"step": 435
},
{
"epoch": 0.2286902286902287,
"grad_norm": 0.822001337030522,
"learning_rate": 1.522491349480969e-05,
"loss": 0.7514,
"step": 440
},
{
"epoch": 0.2312889812889813,
"grad_norm": 0.8196560129735319,
"learning_rate": 1.539792387543253e-05,
"loss": 0.7455,
"step": 445
},
{
"epoch": 0.2338877338877339,
"grad_norm": 0.9001216187487245,
"learning_rate": 1.5570934256055366e-05,
"loss": 0.7523,
"step": 450
},
{
"epoch": 0.23648648648648649,
"grad_norm": 0.9230142554852074,
"learning_rate": 1.57439446366782e-05,
"loss": 0.7569,
"step": 455
},
{
"epoch": 0.2390852390852391,
"grad_norm": 0.8290174186484409,
"learning_rate": 1.5916955017301038e-05,
"loss": 0.7428,
"step": 460
},
{
"epoch": 0.24168399168399168,
"grad_norm": 0.829715213003188,
"learning_rate": 1.6089965397923876e-05,
"loss": 0.7457,
"step": 465
},
{
"epoch": 0.2442827442827443,
"grad_norm": 0.8794988465121758,
"learning_rate": 1.6262975778546713e-05,
"loss": 0.7427,
"step": 470
},
{
"epoch": 0.24688149688149688,
"grad_norm": 0.860878867890723,
"learning_rate": 1.6435986159169554e-05,
"loss": 0.727,
"step": 475
},
{
"epoch": 0.2494802494802495,
"grad_norm": 0.8488363967170557,
"learning_rate": 1.6608996539792388e-05,
"loss": 0.7341,
"step": 480
},
{
"epoch": 0.2520790020790021,
"grad_norm": 0.9024495405776305,
"learning_rate": 1.6782006920415226e-05,
"loss": 0.7445,
"step": 485
},
{
"epoch": 0.25467775467775466,
"grad_norm": 0.8297075062381525,
"learning_rate": 1.6955017301038063e-05,
"loss": 0.7618,
"step": 490
},
{
"epoch": 0.25727650727650725,
"grad_norm": 0.9522709115263103,
"learning_rate": 1.71280276816609e-05,
"loss": 0.7524,
"step": 495
},
{
"epoch": 0.2598752598752599,
"grad_norm": 0.8862001527957881,
"learning_rate": 1.730103806228374e-05,
"loss": 0.7392,
"step": 500
},
{
"epoch": 0.2624740124740125,
"grad_norm": 0.8190856457278167,
"learning_rate": 1.7474048442906576e-05,
"loss": 0.7348,
"step": 505
},
{
"epoch": 0.26507276507276506,
"grad_norm": 0.8084717872755484,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.7555,
"step": 510
},
{
"epoch": 0.26767151767151764,
"grad_norm": 0.8143004262657276,
"learning_rate": 1.782006920415225e-05,
"loss": 0.7493,
"step": 515
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.7439901950641524,
"learning_rate": 1.799307958477509e-05,
"loss": 0.7264,
"step": 520
},
{
"epoch": 0.27286902286902287,
"grad_norm": 0.7802119529056604,
"learning_rate": 1.8166089965397926e-05,
"loss": 0.7484,
"step": 525
},
{
"epoch": 0.27546777546777546,
"grad_norm": 0.7671220875794853,
"learning_rate": 1.833910034602076e-05,
"loss": 0.7365,
"step": 530
},
{
"epoch": 0.27806652806652804,
"grad_norm": 0.8013129805690585,
"learning_rate": 1.8512110726643598e-05,
"loss": 0.7586,
"step": 535
},
{
"epoch": 0.2806652806652807,
"grad_norm": 0.7731421991496061,
"learning_rate": 1.868512110726644e-05,
"loss": 0.7521,
"step": 540
},
{
"epoch": 0.28326403326403327,
"grad_norm": 0.8183545102345747,
"learning_rate": 1.8858131487889276e-05,
"loss": 0.7379,
"step": 545
},
{
"epoch": 0.28586278586278585,
"grad_norm": 0.761114380449014,
"learning_rate": 1.9031141868512114e-05,
"loss": 0.7489,
"step": 550
},
{
"epoch": 0.28846153846153844,
"grad_norm": 0.797967905949635,
"learning_rate": 1.9204152249134948e-05,
"loss": 0.7475,
"step": 555
},
{
"epoch": 0.2910602910602911,
"grad_norm": 0.8141772027308778,
"learning_rate": 1.9377162629757786e-05,
"loss": 0.7403,
"step": 560
},
{
"epoch": 0.29365904365904366,
"grad_norm": 1.0002732271715242,
"learning_rate": 1.9550173010380623e-05,
"loss": 0.7446,
"step": 565
},
{
"epoch": 0.29625779625779625,
"grad_norm": 0.7424317625579876,
"learning_rate": 1.972318339100346e-05,
"loss": 0.7432,
"step": 570
},
{
"epoch": 0.29885654885654883,
"grad_norm": 0.7975265418685308,
"learning_rate": 1.98961937716263e-05,
"loss": 0.7425,
"step": 575
},
{
"epoch": 0.30145530145530147,
"grad_norm": 0.8527920792318469,
"learning_rate": 1.9999992683122277e-05,
"loss": 0.7313,
"step": 580
},
{
"epoch": 0.30405405405405406,
"grad_norm": 0.7826703424284943,
"learning_rate": 1.9999910368370826e-05,
"loss": 0.7404,
"step": 585
},
{
"epoch": 0.30665280665280664,
"grad_norm": 0.7942647670210833,
"learning_rate": 1.9999736593526133e-05,
"loss": 0.7263,
"step": 590
},
{
"epoch": 0.3092515592515592,
"grad_norm": 0.7552220975281737,
"learning_rate": 1.999947136017756e-05,
"loss": 0.7353,
"step": 595
},
{
"epoch": 0.31185031185031187,
"grad_norm": 0.7790597350916263,
"learning_rate": 1.9999114670750955e-05,
"loss": 0.7478,
"step": 600
},
{
"epoch": 0.31444906444906445,
"grad_norm": 0.7982754500449706,
"learning_rate": 1.9998666528508632e-05,
"loss": 0.7414,
"step": 605
},
{
"epoch": 0.31704781704781704,
"grad_norm": 0.8159770553033799,
"learning_rate": 1.9998126937549343e-05,
"loss": 0.7285,
"step": 610
},
{
"epoch": 0.3196465696465696,
"grad_norm": 0.8888821616512309,
"learning_rate": 1.9997495902808233e-05,
"loss": 0.751,
"step": 615
},
{
"epoch": 0.32224532224532226,
"grad_norm": 0.7544060206964511,
"learning_rate": 1.9996773430056806e-05,
"loss": 0.7385,
"step": 620
},
{
"epoch": 0.32484407484407485,
"grad_norm": 0.7895944868586088,
"learning_rate": 1.9995959525902856e-05,
"loss": 0.7369,
"step": 625
},
{
"epoch": 0.32744282744282743,
"grad_norm": 0.7602727085172243,
"learning_rate": 1.999505419779044e-05,
"loss": 0.757,
"step": 630
},
{
"epoch": 0.33004158004158,
"grad_norm": 0.8764699729246701,
"learning_rate": 1.9994057453999754e-05,
"loss": 0.738,
"step": 635
},
{
"epoch": 0.33264033264033266,
"grad_norm": 0.7647288391752125,
"learning_rate": 1.9992969303647124e-05,
"loss": 0.7478,
"step": 640
},
{
"epoch": 0.33523908523908524,
"grad_norm": 0.7069726215488147,
"learning_rate": 1.999178975668486e-05,
"loss": 0.7149,
"step": 645
},
{
"epoch": 0.33783783783783783,
"grad_norm": 0.6497220082269107,
"learning_rate": 1.9990518823901213e-05,
"loss": 0.7496,
"step": 650
},
{
"epoch": 0.3404365904365904,
"grad_norm": 0.6610573730423013,
"learning_rate": 1.9989156516920248e-05,
"loss": 0.7297,
"step": 655
},
{
"epoch": 0.34303534303534305,
"grad_norm": 0.6668607876517594,
"learning_rate": 1.9987702848201748e-05,
"loss": 0.7193,
"step": 660
},
{
"epoch": 0.34563409563409564,
"grad_norm": 0.7860493254567829,
"learning_rate": 1.99861578310411e-05,
"loss": 0.7374,
"step": 665
},
{
"epoch": 0.3482328482328482,
"grad_norm": 0.8925562926124014,
"learning_rate": 1.9984521479569176e-05,
"loss": 0.7237,
"step": 670
},
{
"epoch": 0.3508316008316008,
"grad_norm": 0.7672501463240459,
"learning_rate": 1.9982793808752193e-05,
"loss": 0.7306,
"step": 675
},
{
"epoch": 0.35343035343035345,
"grad_norm": 0.7502385743686751,
"learning_rate": 1.9980974834391583e-05,
"loss": 0.7406,
"step": 680
},
{
"epoch": 0.35602910602910603,
"grad_norm": 0.7564023469276626,
"learning_rate": 1.997906457312386e-05,
"loss": 0.7354,
"step": 685
},
{
"epoch": 0.3586278586278586,
"grad_norm": 0.7147365409493106,
"learning_rate": 1.9977063042420438e-05,
"loss": 0.7312,
"step": 690
},
{
"epoch": 0.3612266112266112,
"grad_norm": 0.8321336652388966,
"learning_rate": 1.9974970260587507e-05,
"loss": 0.7364,
"step": 695
},
{
"epoch": 0.36382536382536385,
"grad_norm": 0.6733104515770179,
"learning_rate": 1.9972786246765832e-05,
"loss": 0.7273,
"step": 700
},
{
"epoch": 0.36642411642411643,
"grad_norm": 0.6923993722045795,
"learning_rate": 1.9970511020930612e-05,
"loss": 0.7259,
"step": 705
},
{
"epoch": 0.369022869022869,
"grad_norm": 0.7106843030691585,
"learning_rate": 1.9968144603891272e-05,
"loss": 0.7409,
"step": 710
},
{
"epoch": 0.3716216216216216,
"grad_norm": 0.6074610563242314,
"learning_rate": 1.9965687017291268e-05,
"loss": 0.7237,
"step": 715
},
{
"epoch": 0.37422037422037424,
"grad_norm": 0.6798181846974808,
"learning_rate": 1.9963138283607918e-05,
"loss": 0.7189,
"step": 720
},
{
"epoch": 0.3768191268191268,
"grad_norm": 0.7233959402973988,
"learning_rate": 1.996049842615217e-05,
"loss": 0.7524,
"step": 725
},
{
"epoch": 0.3794178794178794,
"grad_norm": 0.7818955262414797,
"learning_rate": 1.9957767469068405e-05,
"loss": 0.7259,
"step": 730
},
{
"epoch": 0.382016632016632,
"grad_norm": 0.7248772563760029,
"learning_rate": 1.9954945437334204e-05,
"loss": 0.7312,
"step": 735
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.7079790287253178,
"learning_rate": 1.9952032356760125e-05,
"loss": 0.7041,
"step": 740
},
{
"epoch": 0.3872141372141372,
"grad_norm": 0.7390341417388404,
"learning_rate": 1.994902825398947e-05,
"loss": 0.7133,
"step": 745
},
{
"epoch": 0.3898128898128898,
"grad_norm": 0.8111822578128921,
"learning_rate": 1.9945933156498043e-05,
"loss": 0.729,
"step": 750
},
{
"epoch": 0.3924116424116424,
"grad_norm": 0.7769690688975751,
"learning_rate": 1.9942747092593877e-05,
"loss": 0.715,
"step": 755
},
{
"epoch": 0.39501039501039503,
"grad_norm": 0.6870020477467483,
"learning_rate": 1.9939470091417012e-05,
"loss": 0.7132,
"step": 760
},
{
"epoch": 0.3976091476091476,
"grad_norm": 0.6911704853875393,
"learning_rate": 1.99361021829392e-05,
"loss": 0.7206,
"step": 765
},
{
"epoch": 0.4002079002079002,
"grad_norm": 0.6600143593403244,
"learning_rate": 1.993264339796363e-05,
"loss": 0.7145,
"step": 770
},
{
"epoch": 0.4028066528066528,
"grad_norm": 0.6726432015084747,
"learning_rate": 1.992909376812468e-05,
"loss": 0.739,
"step": 775
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.70615782283502,
"learning_rate": 1.9925453325887574e-05,
"loss": 0.7222,
"step": 780
},
{
"epoch": 0.408004158004158,
"grad_norm": 0.5887807785936404,
"learning_rate": 1.992172210454814e-05,
"loss": 0.7221,
"step": 785
},
{
"epoch": 0.4106029106029106,
"grad_norm": 0.6357715206079563,
"learning_rate": 1.991790013823246e-05,
"loss": 0.717,
"step": 790
},
{
"epoch": 0.4132016632016632,
"grad_norm": 0.6710571738289492,
"learning_rate": 1.9913987461896597e-05,
"loss": 0.7299,
"step": 795
},
{
"epoch": 0.4158004158004158,
"grad_norm": 0.66534846887862,
"learning_rate": 1.990998411132624e-05,
"loss": 0.719,
"step": 800
},
{
"epoch": 0.4183991683991684,
"grad_norm": 0.6658127042254826,
"learning_rate": 1.9905890123136396e-05,
"loss": 0.7156,
"step": 805
},
{
"epoch": 0.420997920997921,
"grad_norm": 0.7461519732525459,
"learning_rate": 1.990170553477106e-05,
"loss": 0.7281,
"step": 810
},
{
"epoch": 0.4235966735966736,
"grad_norm": 0.7960568157470115,
"learning_rate": 1.9897430384502857e-05,
"loss": 0.7229,
"step": 815
},
{
"epoch": 0.4261954261954262,
"grad_norm": 0.7377717323529744,
"learning_rate": 1.9893064711432702e-05,
"loss": 0.7207,
"step": 820
},
{
"epoch": 0.4287941787941788,
"grad_norm": 0.6333020229736416,
"learning_rate": 1.988860855548944e-05,
"loss": 0.7104,
"step": 825
},
{
"epoch": 0.4313929313929314,
"grad_norm": 0.6099863945288464,
"learning_rate": 1.988406195742948e-05,
"loss": 0.7203,
"step": 830
},
{
"epoch": 0.433991683991684,
"grad_norm": 0.6785164904650527,
"learning_rate": 1.987942495883642e-05,
"loss": 0.711,
"step": 835
},
{
"epoch": 0.4365904365904366,
"grad_norm": 0.6622509941324428,
"learning_rate": 1.9874697602120682e-05,
"loss": 0.7325,
"step": 840
},
{
"epoch": 0.4391891891891892,
"grad_norm": 0.6613983198156271,
"learning_rate": 1.986987993051909e-05,
"loss": 0.7233,
"step": 845
},
{
"epoch": 0.4417879417879418,
"grad_norm": 0.6956512036405856,
"learning_rate": 1.9864971988094515e-05,
"loss": 0.7207,
"step": 850
},
{
"epoch": 0.44438669438669437,
"grad_norm": 0.6797121388808018,
"learning_rate": 1.9859973819735443e-05,
"loss": 0.7359,
"step": 855
},
{
"epoch": 0.446985446985447,
"grad_norm": 0.6634402820404799,
"learning_rate": 1.9854885471155586e-05,
"loss": 0.7094,
"step": 860
},
{
"epoch": 0.4495841995841996,
"grad_norm": 0.6337794885487144,
"learning_rate": 1.9849706988893433e-05,
"loss": 0.7276,
"step": 865
},
{
"epoch": 0.4521829521829522,
"grad_norm": 0.6734603186331721,
"learning_rate": 1.9844438420311863e-05,
"loss": 0.7142,
"step": 870
},
{
"epoch": 0.45478170478170477,
"grad_norm": 0.7066192892075979,
"learning_rate": 1.9839079813597687e-05,
"loss": 0.7149,
"step": 875
},
{
"epoch": 0.4573804573804574,
"grad_norm": 0.6964600685285819,
"learning_rate": 1.9833631217761204e-05,
"loss": 0.7281,
"step": 880
},
{
"epoch": 0.45997920997921,
"grad_norm": 0.7109456157271579,
"learning_rate": 1.9828092682635774e-05,
"loss": 0.7332,
"step": 885
},
{
"epoch": 0.4625779625779626,
"grad_norm": 0.669236169004855,
"learning_rate": 1.9822464258877345e-05,
"loss": 0.7293,
"step": 890
},
{
"epoch": 0.46517671517671516,
"grad_norm": 0.6046679594816758,
"learning_rate": 1.9816745997963996e-05,
"loss": 0.706,
"step": 895
},
{
"epoch": 0.4677754677754678,
"grad_norm": 0.6553388635341802,
"learning_rate": 1.981093795219546e-05,
"loss": 0.7136,
"step": 900
},
{
"epoch": 0.4703742203742204,
"grad_norm": 0.6699423752938592,
"learning_rate": 1.980504017469265e-05,
"loss": 0.7056,
"step": 905
},
{
"epoch": 0.47297297297297297,
"grad_norm": 0.637041202100537,
"learning_rate": 1.9799052719397188e-05,
"loss": 0.7221,
"step": 910
},
{
"epoch": 0.47557172557172556,
"grad_norm": 0.657103082344547,
"learning_rate": 1.979297564107088e-05,
"loss": 0.7271,
"step": 915
},
{
"epoch": 0.4781704781704782,
"grad_norm": 0.6885064889983316,
"learning_rate": 1.978680899529524e-05,
"loss": 0.7159,
"step": 920
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.7043512832125569,
"learning_rate": 1.9780552838470976e-05,
"loss": 0.7057,
"step": 925
},
{
"epoch": 0.48336798336798337,
"grad_norm": 0.6627639804543833,
"learning_rate": 1.977420722781746e-05,
"loss": 0.7194,
"step": 930
},
{
"epoch": 0.48596673596673595,
"grad_norm": 0.7069767251302125,
"learning_rate": 1.976777222137224e-05,
"loss": 0.7144,
"step": 935
},
{
"epoch": 0.4885654885654886,
"grad_norm": 0.6090332104645865,
"learning_rate": 1.9761247877990465e-05,
"loss": 0.7161,
"step": 940
},
{
"epoch": 0.4911642411642412,
"grad_norm": 0.7085090978015706,
"learning_rate": 1.9754634257344376e-05,
"loss": 0.733,
"step": 945
},
{
"epoch": 0.49376299376299376,
"grad_norm": 0.6858008371045625,
"learning_rate": 1.9747931419922756e-05,
"loss": 0.7271,
"step": 950
},
{
"epoch": 0.49636174636174635,
"grad_norm": 0.6543468152194417,
"learning_rate": 1.974113942703036e-05,
"loss": 0.7052,
"step": 955
},
{
"epoch": 0.498960498960499,
"grad_norm": 0.7013937541029002,
"learning_rate": 1.9734258340787376e-05,
"loss": 0.7233,
"step": 960
},
{
"epoch": 0.5015592515592515,
"grad_norm": 0.6660787930797433,
"learning_rate": 1.9727288224128852e-05,
"loss": 0.7196,
"step": 965
},
{
"epoch": 0.5041580041580042,
"grad_norm": 0.6541474437978503,
"learning_rate": 1.972022914080411e-05,
"loss": 0.7061,
"step": 970
},
{
"epoch": 0.5067567567567568,
"grad_norm": 0.66883512467633,
"learning_rate": 1.971308115537617e-05,
"loss": 0.7146,
"step": 975
},
{
"epoch": 0.5093555093555093,
"grad_norm": 0.6381037219289445,
"learning_rate": 1.970584433322116e-05,
"loss": 0.7087,
"step": 980
},
{
"epoch": 0.511954261954262,
"grad_norm": 1.1110934200883047,
"learning_rate": 1.969851874052771e-05,
"loss": 0.73,
"step": 985
},
{
"epoch": 0.5145530145530145,
"grad_norm": 0.6310880004216817,
"learning_rate": 1.969110444429637e-05,
"loss": 0.7183,
"step": 990
},
{
"epoch": 0.5171517671517671,
"grad_norm": 0.6410220872400427,
"learning_rate": 1.9683601512338963e-05,
"loss": 0.7086,
"step": 995
},
{
"epoch": 0.5197505197505198,
"grad_norm": 0.5996299242899708,
"learning_rate": 1.9676010013277994e-05,
"loss": 0.7267,
"step": 1000
},
{
"epoch": 0.5223492723492723,
"grad_norm": 0.6590949790761541,
"learning_rate": 1.9668330016546004e-05,
"loss": 0.7374,
"step": 1005
},
{
"epoch": 0.524948024948025,
"grad_norm": 0.6221180388276606,
"learning_rate": 1.9660561592384946e-05,
"loss": 0.7249,
"step": 1010
},
{
"epoch": 0.5275467775467776,
"grad_norm": 0.6026562255053431,
"learning_rate": 1.965270481184553e-05,
"loss": 0.7092,
"step": 1015
},
{
"epoch": 0.5301455301455301,
"grad_norm": 0.6076685489615162,
"learning_rate": 1.9644759746786598e-05,
"loss": 0.7144,
"step": 1020
},
{
"epoch": 0.5327442827442828,
"grad_norm": 0.6466092379791593,
"learning_rate": 1.9636726469874437e-05,
"loss": 0.7021,
"step": 1025
},
{
"epoch": 0.5353430353430353,
"grad_norm": 0.6614878159031965,
"learning_rate": 1.962860505458213e-05,
"loss": 0.7147,
"step": 1030
},
{
"epoch": 0.5379417879417879,
"grad_norm": 0.642038696775677,
"learning_rate": 1.962039557518888e-05,
"loss": 0.7064,
"step": 1035
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.6053359618387539,
"learning_rate": 1.961209810677934e-05,
"loss": 0.7103,
"step": 1040
},
{
"epoch": 0.5431392931392931,
"grad_norm": 0.6208076453451457,
"learning_rate": 1.960371272524291e-05,
"loss": 0.717,
"step": 1045
},
{
"epoch": 0.5457380457380457,
"grad_norm": 0.6668283857181149,
"learning_rate": 1.9595239507273058e-05,
"loss": 0.7048,
"step": 1050
},
{
"epoch": 0.5483367983367984,
"grad_norm": 0.6229130725064413,
"learning_rate": 1.9586678530366607e-05,
"loss": 0.7159,
"step": 1055
},
{
"epoch": 0.5509355509355509,
"grad_norm": 0.5933645289790093,
"learning_rate": 1.9578029872823038e-05,
"loss": 0.7131,
"step": 1060
},
{
"epoch": 0.5535343035343036,
"grad_norm": 0.5857860624157782,
"learning_rate": 1.9569293613743753e-05,
"loss": 0.7037,
"step": 1065
},
{
"epoch": 0.5561330561330561,
"grad_norm": 0.6425150123453736,
"learning_rate": 1.9560469833031383e-05,
"loss": 0.7098,
"step": 1070
},
{
"epoch": 0.5587318087318087,
"grad_norm": 0.619116481041439,
"learning_rate": 1.955155861138903e-05,
"loss": 0.7176,
"step": 1075
},
{
"epoch": 0.5613305613305614,
"grad_norm": 0.635662450753945,
"learning_rate": 1.9542560030319543e-05,
"loss": 0.7104,
"step": 1080
},
{
"epoch": 0.5639293139293139,
"grad_norm": 0.6273126414001168,
"learning_rate": 1.9533474172124763e-05,
"loss": 0.7144,
"step": 1085
},
{
"epoch": 0.5665280665280665,
"grad_norm": 0.5746605470244842,
"learning_rate": 1.952430111990478e-05,
"loss": 0.7058,
"step": 1090
},
{
"epoch": 0.5691268191268192,
"grad_norm": 0.616041790905867,
"learning_rate": 1.9515040957557162e-05,
"loss": 0.7144,
"step": 1095
},
{
"epoch": 0.5717255717255717,
"grad_norm": 0.5604195549287683,
"learning_rate": 1.950569376977621e-05,
"loss": 0.7045,
"step": 1100
},
{
"epoch": 0.5743243243243243,
"grad_norm": 0.5873428291768331,
"learning_rate": 1.9496259642052146e-05,
"loss": 0.7121,
"step": 1105
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.6316260348752082,
"learning_rate": 1.9486738660670373e-05,
"loss": 0.7147,
"step": 1110
},
{
"epoch": 0.5795218295218295,
"grad_norm": 0.5874076313405716,
"learning_rate": 1.9477130912710648e-05,
"loss": 0.7279,
"step": 1115
},
{
"epoch": 0.5821205821205822,
"grad_norm": 0.6676704495322479,
"learning_rate": 1.9467436486046317e-05,
"loss": 0.7103,
"step": 1120
},
{
"epoch": 0.5847193347193347,
"grad_norm": 0.5827519526288305,
"learning_rate": 1.9457655469343482e-05,
"loss": 0.7014,
"step": 1125
},
{
"epoch": 0.5873180873180873,
"grad_norm": 0.6037871799524279,
"learning_rate": 1.944778795206023e-05,
"loss": 0.7053,
"step": 1130
},
{
"epoch": 0.58991683991684,
"grad_norm": 0.5712930594682987,
"learning_rate": 1.9437834024445762e-05,
"loss": 0.7177,
"step": 1135
},
{
"epoch": 0.5925155925155925,
"grad_norm": 0.5970793241519367,
"learning_rate": 1.9427793777539615e-05,
"loss": 0.7127,
"step": 1140
},
{
"epoch": 0.5951143451143451,
"grad_norm": 0.6209626687697077,
"learning_rate": 1.9417667303170803e-05,
"loss": 0.7063,
"step": 1145
},
{
"epoch": 0.5977130977130977,
"grad_norm": 0.606055879020448,
"learning_rate": 1.940745469395698e-05,
"loss": 0.695,
"step": 1150
},
{
"epoch": 0.6003118503118503,
"grad_norm": 0.6188361477212587,
"learning_rate": 1.9397156043303608e-05,
"loss": 0.6966,
"step": 1155
},
{
"epoch": 0.6029106029106029,
"grad_norm": 0.6290824936609826,
"learning_rate": 1.9386771445403086e-05,
"loss": 0.7031,
"step": 1160
},
{
"epoch": 0.6055093555093555,
"grad_norm": 0.6122110814030293,
"learning_rate": 1.9376300995233894e-05,
"loss": 0.7083,
"step": 1165
},
{
"epoch": 0.6081081081081081,
"grad_norm": 0.6273881899574172,
"learning_rate": 1.9365744788559725e-05,
"loss": 0.7023,
"step": 1170
},
{
"epoch": 0.6107068607068608,
"grad_norm": 0.5897253804349057,
"learning_rate": 1.9355102921928606e-05,
"loss": 0.7141,
"step": 1175
},
{
"epoch": 0.6133056133056133,
"grad_norm": 0.6492672708845987,
"learning_rate": 1.9344375492672024e-05,
"loss": 0.7097,
"step": 1180
},
{
"epoch": 0.6159043659043659,
"grad_norm": 0.5984352418529706,
"learning_rate": 1.9333562598904027e-05,
"loss": 0.7068,
"step": 1185
},
{
"epoch": 0.6185031185031185,
"grad_norm": 0.6401953574979402,
"learning_rate": 1.9322664339520328e-05,
"loss": 0.7007,
"step": 1190
},
{
"epoch": 0.6211018711018711,
"grad_norm": 0.6292156047444384,
"learning_rate": 1.93116808141974e-05,
"loss": 0.7114,
"step": 1195
},
{
"epoch": 0.6237006237006237,
"grad_norm": 0.6298280430381119,
"learning_rate": 1.9300612123391574e-05,
"loss": 0.7224,
"step": 1200
},
{
"epoch": 0.6262993762993763,
"grad_norm": 0.5647290693137603,
"learning_rate": 1.92894583683381e-05,
"loss": 0.7029,
"step": 1205
},
{
"epoch": 0.6288981288981289,
"grad_norm": 0.5843497274151073,
"learning_rate": 1.927821965105024e-05,
"loss": 0.6935,
"step": 1210
},
{
"epoch": 0.6314968814968815,
"grad_norm": 0.5742329412422685,
"learning_rate": 1.9266896074318335e-05,
"loss": 0.6921,
"step": 1215
},
{
"epoch": 0.6340956340956341,
"grad_norm": 0.6198304134928966,
"learning_rate": 1.925548774170885e-05,
"loss": 0.7022,
"step": 1220
},
{
"epoch": 0.6366943866943867,
"grad_norm": 0.5927355008313566,
"learning_rate": 1.924399475756343e-05,
"loss": 0.7043,
"step": 1225
},
{
"epoch": 0.6392931392931392,
"grad_norm": 0.5675856487929543,
"learning_rate": 1.9232417226997964e-05,
"loss": 0.6979,
"step": 1230
},
{
"epoch": 0.6418918918918919,
"grad_norm": 0.5801729136751573,
"learning_rate": 1.9220755255901604e-05,
"loss": 0.7128,
"step": 1235
},
{
"epoch": 0.6444906444906445,
"grad_norm": 0.5455856005670234,
"learning_rate": 1.92090089509358e-05,
"loss": 0.7154,
"step": 1240
},
{
"epoch": 0.6470893970893971,
"grad_norm": 0.578411372283767,
"learning_rate": 1.9197178419533328e-05,
"loss": 0.726,
"step": 1245
},
{
"epoch": 0.6496881496881497,
"grad_norm": 0.6165732640247198,
"learning_rate": 1.918526376989731e-05,
"loss": 0.7097,
"step": 1250
},
{
"epoch": 0.6522869022869023,
"grad_norm": 0.579722849123064,
"learning_rate": 1.9173265111000218e-05,
"loss": 0.7181,
"step": 1255
},
{
"epoch": 0.6548856548856549,
"grad_norm": 0.6384864268465269,
"learning_rate": 1.9161182552582885e-05,
"loss": 0.7048,
"step": 1260
},
{
"epoch": 0.6574844074844075,
"grad_norm": 0.5442756986247173,
"learning_rate": 1.9149016205153494e-05,
"loss": 0.6983,
"step": 1265
},
{
"epoch": 0.66008316008316,
"grad_norm": 0.5876328008368029,
"learning_rate": 1.9136766179986566e-05,
"loss": 0.7058,
"step": 1270
},
{
"epoch": 0.6626819126819127,
"grad_norm": 0.556923286518879,
"learning_rate": 1.9124432589121945e-05,
"loss": 0.7048,
"step": 1275
},
{
"epoch": 0.6652806652806653,
"grad_norm": 0.5614004903256652,
"learning_rate": 1.9112015545363793e-05,
"loss": 0.703,
"step": 1280
},
{
"epoch": 0.6678794178794178,
"grad_norm": 0.6029085739466059,
"learning_rate": 1.9099515162279515e-05,
"loss": 0.7149,
"step": 1285
},
{
"epoch": 0.6704781704781705,
"grad_norm": 0.5542833475447663,
"learning_rate": 1.9086931554198756e-05,
"loss": 0.7059,
"step": 1290
},
{
"epoch": 0.6730769230769231,
"grad_norm": 0.596418111214614,
"learning_rate": 1.907426483621235e-05,
"loss": 0.7187,
"step": 1295
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.5964335487077739,
"learning_rate": 1.9061515124171254e-05,
"loss": 0.7023,
"step": 1300
},
{
"epoch": 0.6782744282744283,
"grad_norm": 0.6527135172773815,
"learning_rate": 1.90486825346855e-05,
"loss": 0.6985,
"step": 1305
},
{
"epoch": 0.6808731808731808,
"grad_norm": 0.5854908036035414,
"learning_rate": 1.9035767185123118e-05,
"loss": 0.7097,
"step": 1310
},
{
"epoch": 0.6834719334719335,
"grad_norm": 0.5630331628185049,
"learning_rate": 1.9022769193609077e-05,
"loss": 0.6973,
"step": 1315
},
{
"epoch": 0.6860706860706861,
"grad_norm": 0.5872323843899289,
"learning_rate": 1.900968867902419e-05,
"loss": 0.7069,
"step": 1320
},
{
"epoch": 0.6886694386694386,
"grad_norm": 0.5845474538391455,
"learning_rate": 1.899652576100405e-05,
"loss": 0.7169,
"step": 1325
},
{
"epoch": 0.6912681912681913,
"grad_norm": 0.6164999248623418,
"learning_rate": 1.8983280559937896e-05,
"loss": 0.7005,
"step": 1330
},
{
"epoch": 0.6938669438669439,
"grad_norm": 0.6124510306800306,
"learning_rate": 1.896995319696755e-05,
"loss": 0.701,
"step": 1335
},
{
"epoch": 0.6964656964656964,
"grad_norm": 0.6267010331850633,
"learning_rate": 1.8956543793986287e-05,
"loss": 0.7164,
"step": 1340
},
{
"epoch": 0.6990644490644491,
"grad_norm": 0.5961399562877898,
"learning_rate": 1.8943052473637734e-05,
"loss": 0.7213,
"step": 1345
},
{
"epoch": 0.7016632016632016,
"grad_norm": 0.6174817199293855,
"learning_rate": 1.8929479359314742e-05,
"loss": 0.6985,
"step": 1350
},
{
"epoch": 0.7042619542619543,
"grad_norm": 0.5851247926140993,
"learning_rate": 1.891582457515825e-05,
"loss": 0.6935,
"step": 1355
},
{
"epoch": 0.7068607068607069,
"grad_norm": 0.5776477138388799,
"learning_rate": 1.890208824605616e-05,
"loss": 0.708,
"step": 1360
},
{
"epoch": 0.7094594594594594,
"grad_norm": 0.5309187069380664,
"learning_rate": 1.888827049764219e-05,
"loss": 0.7003,
"step": 1365
},
{
"epoch": 0.7120582120582121,
"grad_norm": 0.5496529326574807,
"learning_rate": 1.8874371456294732e-05,
"loss": 0.6999,
"step": 1370
},
{
"epoch": 0.7146569646569647,
"grad_norm": 0.5339269514909717,
"learning_rate": 1.8860391249135692e-05,
"loss": 0.6966,
"step": 1375
},
{
"epoch": 0.7172557172557172,
"grad_norm": 0.5427973139574223,
"learning_rate": 1.884633000402931e-05,
"loss": 0.6936,
"step": 1380
},
{
"epoch": 0.7198544698544699,
"grad_norm": 0.5672590602791164,
"learning_rate": 1.883218784958103e-05,
"loss": 0.689,
"step": 1385
},
{
"epoch": 0.7224532224532224,
"grad_norm": 0.5402673826941384,
"learning_rate": 1.8817964915136277e-05,
"loss": 0.7072,
"step": 1390
},
{
"epoch": 0.725051975051975,
"grad_norm": 0.5601951835510618,
"learning_rate": 1.8803661330779316e-05,
"loss": 0.7059,
"step": 1395
},
{
"epoch": 0.7276507276507277,
"grad_norm": 0.5857868039965994,
"learning_rate": 1.8789277227332025e-05,
"loss": 0.6799,
"step": 1400
},
{
"epoch": 0.7302494802494802,
"grad_norm": 0.5347885501584507,
"learning_rate": 1.877481273635274e-05,
"loss": 0.6956,
"step": 1405
},
{
"epoch": 0.7328482328482329,
"grad_norm": 0.5502849975189612,
"learning_rate": 1.8760267990135007e-05,
"loss": 0.7059,
"step": 1410
},
{
"epoch": 0.7354469854469855,
"grad_norm": 0.5178257228797314,
"learning_rate": 1.874564312170641e-05,
"loss": 0.7019,
"step": 1415
},
{
"epoch": 0.738045738045738,
"grad_norm": 0.5607208259193451,
"learning_rate": 1.8730938264827322e-05,
"loss": 0.6963,
"step": 1420
},
{
"epoch": 0.7406444906444907,
"grad_norm": 0.5821162244405798,
"learning_rate": 1.8716153553989716e-05,
"loss": 0.6965,
"step": 1425
},
{
"epoch": 0.7432432432432432,
"grad_norm": 0.5495747594677731,
"learning_rate": 1.8701289124415902e-05,
"loss": 0.6963,
"step": 1430
},
{
"epoch": 0.7458419958419958,
"grad_norm": 0.528297292924797,
"learning_rate": 1.868634511205731e-05,
"loss": 0.6917,
"step": 1435
},
{
"epoch": 0.7484407484407485,
"grad_norm": 0.5326976953811587,
"learning_rate": 1.8671321653593244e-05,
"loss": 0.6989,
"step": 1440
},
{
"epoch": 0.751039501039501,
"grad_norm": 0.5584186177167862,
"learning_rate": 1.8656218886429624e-05,
"loss": 0.7031,
"step": 1445
},
{
"epoch": 0.7536382536382537,
"grad_norm": 0.5570198454055475,
"learning_rate": 1.8641036948697736e-05,
"loss": 0.7023,
"step": 1450
},
{
"epoch": 0.7562370062370062,
"grad_norm": 0.6760644666500142,
"learning_rate": 1.8625775979252976e-05,
"loss": 0.6789,
"step": 1455
},
{
"epoch": 0.7588357588357588,
"grad_norm": 0.5638434234347486,
"learning_rate": 1.8610436117673557e-05,
"loss": 0.6986,
"step": 1460
},
{
"epoch": 0.7614345114345115,
"grad_norm": 0.5493778378867652,
"learning_rate": 1.8595017504259253e-05,
"loss": 0.6785,
"step": 1465
},
{
"epoch": 0.764033264033264,
"grad_norm": 0.6031826832296197,
"learning_rate": 1.8579520280030118e-05,
"loss": 0.6995,
"step": 1470
},
{
"epoch": 0.7666320166320166,
"grad_norm": 0.5143780295012962,
"learning_rate": 1.8563944586725175e-05,
"loss": 0.6846,
"step": 1475
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.5539515728601708,
"learning_rate": 1.8548290566801132e-05,
"loss": 0.7238,
"step": 1480
},
{
"epoch": 0.7718295218295218,
"grad_norm": 0.5421409786755411,
"learning_rate": 1.853255836343109e-05,
"loss": 0.6999,
"step": 1485
},
{
"epoch": 0.7744282744282744,
"grad_norm": 0.6141673616193241,
"learning_rate": 1.8516748120503217e-05,
"loss": 0.6899,
"step": 1490
},
{
"epoch": 0.777027027027027,
"grad_norm": 0.6088321493956566,
"learning_rate": 1.8500859982619438e-05,
"loss": 0.6985,
"step": 1495
},
{
"epoch": 0.7796257796257796,
"grad_norm": 0.5792987579663321,
"learning_rate": 1.848489409509411e-05,
"loss": 0.7015,
"step": 1500
},
{
"epoch": 0.7822245322245323,
"grad_norm": 0.5889071004078938,
"learning_rate": 1.84688506039527e-05,
"loss": 0.6961,
"step": 1505
},
{
"epoch": 0.7848232848232848,
"grad_norm": 0.6583485451018368,
"learning_rate": 1.845272965593045e-05,
"loss": 0.6999,
"step": 1510
},
{
"epoch": 0.7874220374220374,
"grad_norm": 0.5605926278169279,
"learning_rate": 1.843653139847101e-05,
"loss": 0.6862,
"step": 1515
},
{
"epoch": 0.7900207900207901,
"grad_norm": 0.5528728709462963,
"learning_rate": 1.842025597972513e-05,
"loss": 0.697,
"step": 1520
},
{
"epoch": 0.7926195426195426,
"grad_norm": 0.5793992149063935,
"learning_rate": 1.840390354854927e-05,
"loss": 0.6981,
"step": 1525
},
{
"epoch": 0.7952182952182952,
"grad_norm": 0.5629064758499602,
"learning_rate": 1.8387474254504265e-05,
"loss": 0.6847,
"step": 1530
},
{
"epoch": 0.7978170478170478,
"grad_norm": 0.5625360669791298,
"learning_rate": 1.8370968247853933e-05,
"loss": 0.7102,
"step": 1535
},
{
"epoch": 0.8004158004158004,
"grad_norm": 0.575364667753087,
"learning_rate": 1.8354385679563723e-05,
"loss": 0.7028,
"step": 1540
},
{
"epoch": 0.803014553014553,
"grad_norm": 0.5391664994143878,
"learning_rate": 1.8337726701299313e-05,
"loss": 0.6972,
"step": 1545
},
{
"epoch": 0.8056133056133056,
"grad_norm": 0.5411008753649549,
"learning_rate": 1.8320991465425243e-05,
"loss": 0.6903,
"step": 1550
},
{
"epoch": 0.8082120582120582,
"grad_norm": 0.5247464270778599,
"learning_rate": 1.8304180125003505e-05,
"loss": 0.6892,
"step": 1555
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.5616645790978936,
"learning_rate": 1.8287292833792157e-05,
"loss": 0.6996,
"step": 1560
},
{
"epoch": 0.8134095634095634,
"grad_norm": 0.5496955252051037,
"learning_rate": 1.8270329746243903e-05,
"loss": 0.7093,
"step": 1565
},
{
"epoch": 0.816008316008316,
"grad_norm": 0.5727569676124988,
"learning_rate": 1.8253291017504694e-05,
"loss": 0.6921,
"step": 1570
},
{
"epoch": 0.8186070686070686,
"grad_norm": 0.5394235138224174,
"learning_rate": 1.8236176803412296e-05,
"loss": 0.6915,
"step": 1575
},
{
"epoch": 0.8212058212058212,
"grad_norm": 0.5545057246411995,
"learning_rate": 1.8218987260494877e-05,
"loss": 0.7076,
"step": 1580
},
{
"epoch": 0.8238045738045738,
"grad_norm": 0.5440533432959407,
"learning_rate": 1.820172254596956e-05,
"loss": 0.6765,
"step": 1585
},
{
"epoch": 0.8264033264033264,
"grad_norm": 0.5572733236733112,
"learning_rate": 1.8184382817741005e-05,
"loss": 0.699,
"step": 1590
},
{
"epoch": 0.829002079002079,
"grad_norm": 0.5531936584129153,
"learning_rate": 1.816696823439995e-05,
"loss": 0.6921,
"step": 1595
},
{
"epoch": 0.8316008316008316,
"grad_norm": 0.5885460971318106,
"learning_rate": 1.814947895522176e-05,
"loss": 0.7058,
"step": 1600
},
{
"epoch": 0.8341995841995842,
"grad_norm": 0.5258234834971192,
"learning_rate": 1.8131915140164985e-05,
"loss": 0.7075,
"step": 1605
},
{
"epoch": 0.8367983367983368,
"grad_norm": 0.6039050150490132,
"learning_rate": 1.8114276949869877e-05,
"loss": 0.7022,
"step": 1610
},
{
"epoch": 0.8393970893970893,
"grad_norm": 0.5562997078883312,
"learning_rate": 1.809656454565693e-05,
"loss": 0.7079,
"step": 1615
},
{
"epoch": 0.841995841995842,
"grad_norm": 0.5537066778477406,
"learning_rate": 1.8078778089525423e-05,
"loss": 0.6982,
"step": 1620
},
{
"epoch": 0.8445945945945946,
"grad_norm": 0.5336085274714755,
"learning_rate": 1.80609177441519e-05,
"loss": 0.6813,
"step": 1625
},
{
"epoch": 0.8471933471933472,
"grad_norm": 0.5921356105703777,
"learning_rate": 1.8042983672888706e-05,
"loss": 0.6982,
"step": 1630
},
{
"epoch": 0.8497920997920998,
"grad_norm": 0.5823716073373996,
"learning_rate": 1.8024976039762507e-05,
"loss": 0.7007,
"step": 1635
},
{
"epoch": 0.8523908523908524,
"grad_norm": 0.5413045444506639,
"learning_rate": 1.8006895009472747e-05,
"loss": 0.693,
"step": 1640
},
{
"epoch": 0.854989604989605,
"grad_norm": 0.5634810470157456,
"learning_rate": 1.7988740747390182e-05,
"loss": 0.6973,
"step": 1645
},
{
"epoch": 0.8575883575883576,
"grad_norm": 0.5742831274552044,
"learning_rate": 1.797051341955536e-05,
"loss": 0.6983,
"step": 1650
},
{
"epoch": 0.8601871101871101,
"grad_norm": 0.5870076478023661,
"learning_rate": 1.7952213192677074e-05,
"loss": 0.7139,
"step": 1655
},
{
"epoch": 0.8627858627858628,
"grad_norm": 0.5730171778092863,
"learning_rate": 1.7933840234130878e-05,
"loss": 0.7048,
"step": 1660
},
{
"epoch": 0.8653846153846154,
"grad_norm": 0.5104051114581488,
"learning_rate": 1.7915394711957523e-05,
"loss": 0.7073,
"step": 1665
},
{
"epoch": 0.867983367983368,
"grad_norm": 0.5580415382036159,
"learning_rate": 1.7896876794861443e-05,
"loss": 0.6942,
"step": 1670
},
{
"epoch": 0.8705821205821206,
"grad_norm": 0.5837643043381491,
"learning_rate": 1.7878286652209196e-05,
"loss": 0.7025,
"step": 1675
},
{
"epoch": 0.8731808731808732,
"grad_norm": 0.5423350178745967,
"learning_rate": 1.785962445402792e-05,
"loss": 0.6952,
"step": 1680
},
{
"epoch": 0.8757796257796258,
"grad_norm": 0.5729568854084454,
"learning_rate": 1.7840890371003795e-05,
"loss": 0.6966,
"step": 1685
},
{
"epoch": 0.8783783783783784,
"grad_norm": 0.5553594551886265,
"learning_rate": 1.782208457448044e-05,
"loss": 0.7013,
"step": 1690
},
{
"epoch": 0.8809771309771309,
"grad_norm": 0.535467096093925,
"learning_rate": 1.7803207236457404e-05,
"loss": 0.7082,
"step": 1695
},
{
"epoch": 0.8835758835758836,
"grad_norm": 0.5486284072585131,
"learning_rate": 1.778425852958853e-05,
"loss": 0.6666,
"step": 1700
},
{
"epoch": 0.8861746361746362,
"grad_norm": 0.5078876333931026,
"learning_rate": 1.7765238627180424e-05,
"loss": 0.6894,
"step": 1705
},
{
"epoch": 0.8887733887733887,
"grad_norm": 0.5667869632736022,
"learning_rate": 1.7746147703190857e-05,
"loss": 0.704,
"step": 1710
},
{
"epoch": 0.8913721413721414,
"grad_norm": 0.574959887360108,
"learning_rate": 1.7726985932227156e-05,
"loss": 0.7107,
"step": 1715
},
{
"epoch": 0.893970893970894,
"grad_norm": 0.5224789850325783,
"learning_rate": 1.7707753489544628e-05,
"loss": 0.7047,
"step": 1720
},
{
"epoch": 0.8965696465696466,
"grad_norm": 0.5527415568002146,
"learning_rate": 1.768845055104495e-05,
"loss": 0.7091,
"step": 1725
},
{
"epoch": 0.8991683991683992,
"grad_norm": 0.5812394569236012,
"learning_rate": 1.7669077293274564e-05,
"loss": 0.6862,
"step": 1730
},
{
"epoch": 0.9017671517671517,
"grad_norm": 0.5235725170689791,
"learning_rate": 1.764963389342305e-05,
"loss": 0.6811,
"step": 1735
},
{
"epoch": 0.9043659043659044,
"grad_norm": 0.5171430012007782,
"learning_rate": 1.7630120529321518e-05,
"loss": 0.6846,
"step": 1740
},
{
"epoch": 0.906964656964657,
"grad_norm": 0.528693038590328,
"learning_rate": 1.7610537379440987e-05,
"loss": 0.6915,
"step": 1745
},
{
"epoch": 0.9095634095634095,
"grad_norm": 0.5364540922380395,
"learning_rate": 1.759088462289072e-05,
"loss": 0.684,
"step": 1750
},
{
"epoch": 0.9121621621621622,
"grad_norm": 0.5660558568326042,
"learning_rate": 1.7571162439416632e-05,
"loss": 0.6955,
"step": 1755
},
{
"epoch": 0.9147609147609148,
"grad_norm": 0.5672957617937873,
"learning_rate": 1.755137100939961e-05,
"loss": 0.6988,
"step": 1760
},
{
"epoch": 0.9173596673596673,
"grad_norm": 0.5530873304373302,
"learning_rate": 1.753151051385388e-05,
"loss": 0.6833,
"step": 1765
},
{
"epoch": 0.91995841995842,
"grad_norm": 0.5178111657664748,
"learning_rate": 1.7511581134425347e-05,
"loss": 0.7073,
"step": 1770
},
{
"epoch": 0.9225571725571725,
"grad_norm": 0.5721288757020301,
"learning_rate": 1.7491583053389937e-05,
"loss": 0.6863,
"step": 1775
},
{
"epoch": 0.9251559251559252,
"grad_norm": 0.5726074152322754,
"learning_rate": 1.7471516453651925e-05,
"loss": 0.6885,
"step": 1780
},
{
"epoch": 0.9277546777546778,
"grad_norm": 0.5569113799035454,
"learning_rate": 1.7451381518742264e-05,
"loss": 0.6919,
"step": 1785
},
{
"epoch": 0.9303534303534303,
"grad_norm": 0.5349940876609687,
"learning_rate": 1.7431178432816905e-05,
"loss": 0.6888,
"step": 1790
},
{
"epoch": 0.932952182952183,
"grad_norm": 0.48085486147721074,
"learning_rate": 1.7410907380655118e-05,
"loss": 0.6892,
"step": 1795
},
{
"epoch": 0.9355509355509356,
"grad_norm": 0.5162470850450532,
"learning_rate": 1.7390568547657797e-05,
"loss": 0.6844,
"step": 1800
},
{
"epoch": 0.9381496881496881,
"grad_norm": 0.5500432932817269,
"learning_rate": 1.7370162119845768e-05,
"loss": 0.677,
"step": 1805
},
{
"epoch": 0.9407484407484408,
"grad_norm": 0.5691270831237378,
"learning_rate": 1.734968828385808e-05,
"loss": 0.6816,
"step": 1810
},
{
"epoch": 0.9433471933471933,
"grad_norm": 0.5353076421264558,
"learning_rate": 1.7329147226950303e-05,
"loss": 0.6825,
"step": 1815
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.552477154180168,
"learning_rate": 1.7308539136992823e-05,
"loss": 0.6893,
"step": 1820
},
{
"epoch": 0.9485446985446986,
"grad_norm": 0.5280777987730796,
"learning_rate": 1.7287864202469117e-05,
"loss": 0.7004,
"step": 1825
},
{
"epoch": 0.9511434511434511,
"grad_norm": 0.5437828698378319,
"learning_rate": 1.7267122612474013e-05,
"loss": 0.6761,
"step": 1830
},
{
"epoch": 0.9537422037422038,
"grad_norm": 0.5687279165024458,
"learning_rate": 1.7246314556711994e-05,
"loss": 0.6894,
"step": 1835
},
{
"epoch": 0.9563409563409564,
"grad_norm": 0.5740312633264971,
"learning_rate": 1.7225440225495436e-05,
"loss": 0.6914,
"step": 1840
},
{
"epoch": 0.9589397089397089,
"grad_norm": 0.5573795518397149,
"learning_rate": 1.720449980974288e-05,
"loss": 0.6771,
"step": 1845
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.5351408449090207,
"learning_rate": 1.7183493500977277e-05,
"loss": 0.6932,
"step": 1850
},
{
"epoch": 0.9641372141372141,
"grad_norm": 0.5528674527887268,
"learning_rate": 1.7162421491324247e-05,
"loss": 0.6836,
"step": 1855
},
{
"epoch": 0.9667359667359667,
"grad_norm": 0.5406423387911308,
"learning_rate": 1.7141283973510313e-05,
"loss": 0.691,
"step": 1860
},
{
"epoch": 0.9693347193347194,
"grad_norm": 0.5420681920741066,
"learning_rate": 1.712008114086115e-05,
"loss": 0.7039,
"step": 1865
},
{
"epoch": 0.9719334719334719,
"grad_norm": 0.496824504759365,
"learning_rate": 1.7098813187299786e-05,
"loss": 0.692,
"step": 1870
},
{
"epoch": 0.9745322245322245,
"grad_norm": 0.5324381566943999,
"learning_rate": 1.707748030734488e-05,
"loss": 0.6776,
"step": 1875
},
{
"epoch": 0.9771309771309772,
"grad_norm": 0.5230259681581492,
"learning_rate": 1.7056082696108896e-05,
"loss": 0.6847,
"step": 1880
},
{
"epoch": 0.9797297297297297,
"grad_norm": 0.5404603350045258,
"learning_rate": 1.7034620549296336e-05,
"loss": 0.6896,
"step": 1885
},
{
"epoch": 0.9823284823284824,
"grad_norm": 0.5198497643717813,
"learning_rate": 1.701309406320196e-05,
"loss": 0.6676,
"step": 1890
},
{
"epoch": 0.9849272349272349,
"grad_norm": 0.49415192104030464,
"learning_rate": 1.699150343470897e-05,
"loss": 0.6839,
"step": 1895
},
{
"epoch": 0.9875259875259875,
"grad_norm": 0.5044834255400932,
"learning_rate": 1.696984886128723e-05,
"loss": 0.6913,
"step": 1900
},
{
"epoch": 0.9901247401247402,
"grad_norm": 0.5398617776602235,
"learning_rate": 1.6948130540991443e-05,
"loss": 0.6874,
"step": 1905
},
{
"epoch": 0.9927234927234927,
"grad_norm": 0.5225589262440207,
"learning_rate": 1.6926348672459347e-05,
"loss": 0.6822,
"step": 1910
},
{
"epoch": 0.9953222453222453,
"grad_norm": 0.5351932477818484,
"learning_rate": 1.6904503454909905e-05,
"loss": 0.668,
"step": 1915
},
{
"epoch": 0.997920997920998,
"grad_norm": 0.5596299212706576,
"learning_rate": 1.688259508814147e-05,
"loss": 0.6884,
"step": 1920
},
{
"epoch": 1.0,
"eval_loss": 0.7763931155204773,
"eval_runtime": 106.5617,
"eval_samples_per_second": 77.045,
"eval_steps_per_second": 1.211,
"step": 1924
},
{
"epoch": 1.0005197505197505,
"grad_norm": 0.7015242091925799,
"learning_rate": 1.6860623772529964e-05,
"loss": 0.6682,
"step": 1925
},
{
"epoch": 1.003118503118503,
"grad_norm": 0.6034586283492043,
"learning_rate": 1.6838589709027043e-05,
"loss": 0.6255,
"step": 1930
},
{
"epoch": 1.0057172557172558,
"grad_norm": 0.6673777384785803,
"learning_rate": 1.681649309915827e-05,
"loss": 0.615,
"step": 1935
},
{
"epoch": 1.0083160083160083,
"grad_norm": 0.55969047689154,
"learning_rate": 1.6794334145021252e-05,
"loss": 0.6276,
"step": 1940
},
{
"epoch": 1.0109147609147608,
"grad_norm": 0.5655945464921515,
"learning_rate": 1.677211304928381e-05,
"loss": 0.6072,
"step": 1945
},
{
"epoch": 1.0135135135135136,
"grad_norm": 0.5544352654106766,
"learning_rate": 1.6749830015182106e-05,
"loss": 0.604,
"step": 1950
},
{
"epoch": 1.0161122661122661,
"grad_norm": 0.5513580613594744,
"learning_rate": 1.6727485246518813e-05,
"loss": 0.6087,
"step": 1955
},
{
"epoch": 1.0187110187110187,
"grad_norm": 0.5475994898582014,
"learning_rate": 1.6705078947661224e-05,
"loss": 0.6125,
"step": 1960
},
{
"epoch": 1.0213097713097714,
"grad_norm": 0.5469347962338588,
"learning_rate": 1.668261132353939e-05,
"loss": 0.6079,
"step": 1965
},
{
"epoch": 1.023908523908524,
"grad_norm": 0.6609443347963427,
"learning_rate": 1.6660082579644257e-05,
"loss": 0.6085,
"step": 1970
},
{
"epoch": 1.0265072765072765,
"grad_norm": 0.5735256704279655,
"learning_rate": 1.6637492922025767e-05,
"loss": 0.5988,
"step": 1975
},
{
"epoch": 1.0291060291060292,
"grad_norm": 0.5604570622446723,
"learning_rate": 1.6614842557291003e-05,
"loss": 0.605,
"step": 1980
},
{
"epoch": 1.0317047817047817,
"grad_norm": 0.5716162653407055,
"learning_rate": 1.6592131692602257e-05,
"loss": 0.6199,
"step": 1985
},
{
"epoch": 1.0343035343035343,
"grad_norm": 0.572255735683218,
"learning_rate": 1.6569360535675177e-05,
"loss": 0.6136,
"step": 1990
},
{
"epoch": 1.0369022869022868,
"grad_norm": 0.5548434711803698,
"learning_rate": 1.654652929477684e-05,
"loss": 0.6292,
"step": 1995
},
{
"epoch": 1.0395010395010396,
"grad_norm": 0.5372326277134161,
"learning_rate": 1.6523638178723863e-05,
"loss": 0.615,
"step": 2000
},
{
"epoch": 1.042099792099792,
"grad_norm": 0.5629251525957598,
"learning_rate": 1.6500687396880483e-05,
"loss": 0.5994,
"step": 2005
},
{
"epoch": 1.0446985446985446,
"grad_norm": 0.5386319104306212,
"learning_rate": 1.6477677159156647e-05,
"loss": 0.6074,
"step": 2010
},
{
"epoch": 1.0472972972972974,
"grad_norm": 0.5344519142234625,
"learning_rate": 1.6454607676006085e-05,
"loss": 0.6093,
"step": 2015
},
{
"epoch": 1.04989604989605,
"grad_norm": 0.5911538942849666,
"learning_rate": 1.64314791584244e-05,
"loss": 0.6219,
"step": 2020
},
{
"epoch": 1.0524948024948024,
"grad_norm": 0.5747632841380809,
"learning_rate": 1.6408291817947126e-05,
"loss": 0.6229,
"step": 2025
},
{
"epoch": 1.0550935550935552,
"grad_norm": 0.5553970996601802,
"learning_rate": 1.6385045866647797e-05,
"loss": 0.6131,
"step": 2030
},
{
"epoch": 1.0576923076923077,
"grad_norm": 0.54652804470796,
"learning_rate": 1.6361741517136e-05,
"loss": 0.6189,
"step": 2035
},
{
"epoch": 1.0602910602910602,
"grad_norm": 0.5652320668164962,
"learning_rate": 1.633837898255545e-05,
"loss": 0.6206,
"step": 2040
},
{
"epoch": 1.062889812889813,
"grad_norm": 0.5377492795503913,
"learning_rate": 1.631495847658202e-05,
"loss": 0.6246,
"step": 2045
},
{
"epoch": 1.0654885654885655,
"grad_norm": 0.5701190357161201,
"learning_rate": 1.6291480213421796e-05,
"loss": 0.6151,
"step": 2050
},
{
"epoch": 1.068087318087318,
"grad_norm": 0.5661751524816903,
"learning_rate": 1.626794440780911e-05,
"loss": 0.6155,
"step": 2055
},
{
"epoch": 1.0706860706860706,
"grad_norm": 0.563344281114797,
"learning_rate": 1.62443512750046e-05,
"loss": 0.6065,
"step": 2060
},
{
"epoch": 1.0732848232848233,
"grad_norm": 0.5605142234460203,
"learning_rate": 1.6220701030793203e-05,
"loss": 0.6182,
"step": 2065
},
{
"epoch": 1.0758835758835759,
"grad_norm": 0.5276428593516702,
"learning_rate": 1.6196993891482216e-05,
"loss": 0.622,
"step": 2070
},
{
"epoch": 1.0784823284823284,
"grad_norm": 0.5855580679475535,
"learning_rate": 1.6173230073899303e-05,
"loss": 0.613,
"step": 2075
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.5464688542980072,
"learning_rate": 1.6149409795390503e-05,
"loss": 0.6109,
"step": 2080
},
{
"epoch": 1.0836798336798337,
"grad_norm": 0.523861090524199,
"learning_rate": 1.6125533273818257e-05,
"loss": 0.5932,
"step": 2085
},
{
"epoch": 1.0862785862785862,
"grad_norm": 0.5800436769814354,
"learning_rate": 1.6101600727559423e-05,
"loss": 0.5974,
"step": 2090
},
{
"epoch": 1.088877338877339,
"grad_norm": 0.5314763619677401,
"learning_rate": 1.6077612375503244e-05,
"loss": 0.6233,
"step": 2095
},
{
"epoch": 1.0914760914760915,
"grad_norm": 0.5292466384443512,
"learning_rate": 1.605356843704938e-05,
"loss": 0.6082,
"step": 2100
},
{
"epoch": 1.094074844074844,
"grad_norm": 0.5877188139637917,
"learning_rate": 1.6029469132105886e-05,
"loss": 0.6255,
"step": 2105
},
{
"epoch": 1.0966735966735968,
"grad_norm": 0.5760369419098388,
"learning_rate": 1.6005314681087208e-05,
"loss": 0.6157,
"step": 2110
},
{
"epoch": 1.0992723492723493,
"grad_norm": 0.630752472432159,
"learning_rate": 1.598110530491216e-05,
"loss": 0.6175,
"step": 2115
},
{
"epoch": 1.1018711018711018,
"grad_norm": 0.5608953698596442,
"learning_rate": 1.595684122500191e-05,
"loss": 0.6177,
"step": 2120
},
{
"epoch": 1.1044698544698546,
"grad_norm": 0.573695393031942,
"learning_rate": 1.593252266327794e-05,
"loss": 0.6243,
"step": 2125
},
{
"epoch": 1.107068607068607,
"grad_norm": 0.5935701794951512,
"learning_rate": 1.590814984216004e-05,
"loss": 0.6134,
"step": 2130
},
{
"epoch": 1.1096673596673596,
"grad_norm": 0.5895903291761935,
"learning_rate": 1.588372298456426e-05,
"loss": 0.6082,
"step": 2135
},
{
"epoch": 1.1122661122661124,
"grad_norm": 0.5711362021969438,
"learning_rate": 1.5859242313900866e-05,
"loss": 0.6048,
"step": 2140
},
{
"epoch": 1.114864864864865,
"grad_norm": 0.5761578602169135,
"learning_rate": 1.583470805407231e-05,
"loss": 0.619,
"step": 2145
},
{
"epoch": 1.1174636174636174,
"grad_norm": 0.5567866303525553,
"learning_rate": 1.581012042947117e-05,
"loss": 0.6112,
"step": 2150
},
{
"epoch": 1.12006237006237,
"grad_norm": 0.5493991058746482,
"learning_rate": 1.578547966497811e-05,
"loss": 0.5976,
"step": 2155
},
{
"epoch": 1.1226611226611227,
"grad_norm": 0.5652320554508646,
"learning_rate": 1.57607859859598e-05,
"loss": 0.6048,
"step": 2160
},
{
"epoch": 1.1252598752598753,
"grad_norm": 0.5401641304994612,
"learning_rate": 1.57360396182669e-05,
"loss": 0.6082,
"step": 2165
},
{
"epoch": 1.1278586278586278,
"grad_norm": 0.5606818865719918,
"learning_rate": 1.5711240788231933e-05,
"loss": 0.6039,
"step": 2170
},
{
"epoch": 1.1304573804573805,
"grad_norm": 0.6007383546804671,
"learning_rate": 1.5686389722667273e-05,
"loss": 0.6047,
"step": 2175
},
{
"epoch": 1.133056133056133,
"grad_norm": 0.5715756539794042,
"learning_rate": 1.5661486648863027e-05,
"loss": 0.6252,
"step": 2180
},
{
"epoch": 1.1356548856548856,
"grad_norm": 0.6079845247405427,
"learning_rate": 1.563653179458499e-05,
"loss": 0.6099,
"step": 2185
},
{
"epoch": 1.1382536382536383,
"grad_norm": 0.5663226785265596,
"learning_rate": 1.5611525388072525e-05,
"loss": 0.5996,
"step": 2190
},
{
"epoch": 1.1408523908523909,
"grad_norm": 0.6108898947357355,
"learning_rate": 1.5586467658036526e-05,
"loss": 0.6209,
"step": 2195
},
{
"epoch": 1.1434511434511434,
"grad_norm": 0.6420427561575582,
"learning_rate": 1.556135883365727e-05,
"loss": 0.6038,
"step": 2200
},
{
"epoch": 1.1460498960498962,
"grad_norm": 0.587335250663389,
"learning_rate": 1.5536199144582354e-05,
"loss": 0.6242,
"step": 2205
},
{
"epoch": 1.1486486486486487,
"grad_norm": 0.5910496137391441,
"learning_rate": 1.5510988820924598e-05,
"loss": 0.6069,
"step": 2210
},
{
"epoch": 1.1512474012474012,
"grad_norm": 0.5655552313228328,
"learning_rate": 1.5485728093259923e-05,
"loss": 0.6225,
"step": 2215
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.554660591831712,
"learning_rate": 1.5460417192625245e-05,
"loss": 0.6121,
"step": 2220
},
{
"epoch": 1.1564449064449065,
"grad_norm": 0.5924033501687683,
"learning_rate": 1.5435056350516376e-05,
"loss": 0.6108,
"step": 2225
},
{
"epoch": 1.159043659043659,
"grad_norm": 0.5678195976691061,
"learning_rate": 1.54096457988859e-05,
"loss": 0.6146,
"step": 2230
},
{
"epoch": 1.1616424116424116,
"grad_norm": 0.582762073846593,
"learning_rate": 1.5384185770141027e-05,
"loss": 0.6116,
"step": 2235
},
{
"epoch": 1.1642411642411643,
"grad_norm": 0.5520839459854381,
"learning_rate": 1.535867649714152e-05,
"loss": 0.6167,
"step": 2240
},
{
"epoch": 1.1668399168399168,
"grad_norm": 0.5394988505298011,
"learning_rate": 1.533311821319751e-05,
"loss": 0.6173,
"step": 2245
},
{
"epoch": 1.1694386694386694,
"grad_norm": 0.5490162258104867,
"learning_rate": 1.5307511152067397e-05,
"loss": 0.6195,
"step": 2250
},
{
"epoch": 1.1720374220374221,
"grad_norm": 0.5176946937084966,
"learning_rate": 1.5281855547955704e-05,
"loss": 0.6063,
"step": 2255
},
{
"epoch": 1.1746361746361746,
"grad_norm": 0.5697232320984311,
"learning_rate": 1.5256151635510925e-05,
"loss": 0.6132,
"step": 2260
},
{
"epoch": 1.1772349272349272,
"grad_norm": 0.5408355403813135,
"learning_rate": 1.5230399649823389e-05,
"loss": 0.6202,
"step": 2265
},
{
"epoch": 1.17983367983368,
"grad_norm": 0.5504776040838202,
"learning_rate": 1.5204599826423108e-05,
"loss": 0.6121,
"step": 2270
},
{
"epoch": 1.1824324324324325,
"grad_norm": 0.5337013368651256,
"learning_rate": 1.5178752401277628e-05,
"loss": 0.616,
"step": 2275
},
{
"epoch": 1.185031185031185,
"grad_norm": 0.562149132935065,
"learning_rate": 1.5152857610789854e-05,
"loss": 0.6097,
"step": 2280
},
{
"epoch": 1.1876299376299375,
"grad_norm": 0.5909197735161369,
"learning_rate": 1.5126915691795905e-05,
"loss": 0.6188,
"step": 2285
},
{
"epoch": 1.1902286902286903,
"grad_norm": 0.5535938243322149,
"learning_rate": 1.5100926881562936e-05,
"loss": 0.6137,
"step": 2290
},
{
"epoch": 1.1928274428274428,
"grad_norm": 0.544767406909682,
"learning_rate": 1.5074891417786993e-05,
"loss": 0.6133,
"step": 2295
},
{
"epoch": 1.1954261954261955,
"grad_norm": 0.5459850942463099,
"learning_rate": 1.5048809538590789e-05,
"loss": 0.613,
"step": 2300
},
{
"epoch": 1.198024948024948,
"grad_norm": 0.5873358493955128,
"learning_rate": 1.5022681482521579e-05,
"loss": 0.6156,
"step": 2305
},
{
"epoch": 1.2006237006237006,
"grad_norm": 0.5644324461104552,
"learning_rate": 1.499650748854895e-05,
"loss": 0.6155,
"step": 2310
},
{
"epoch": 1.2032224532224531,
"grad_norm": 0.5531535214490884,
"learning_rate": 1.4970287796062642e-05,
"loss": 0.6191,
"step": 2315
},
{
"epoch": 1.2058212058212059,
"grad_norm": 0.5509179294326446,
"learning_rate": 1.494402264487035e-05,
"loss": 0.614,
"step": 2320
},
{
"epoch": 1.2084199584199584,
"grad_norm": 0.5585470168515849,
"learning_rate": 1.491771227519555e-05,
"loss": 0.6139,
"step": 2325
},
{
"epoch": 1.211018711018711,
"grad_norm": 0.5129593419686834,
"learning_rate": 1.4891356927675284e-05,
"loss": 0.6089,
"step": 2330
},
{
"epoch": 1.2136174636174637,
"grad_norm": 0.5920443075253277,
"learning_rate": 1.4864956843357967e-05,
"loss": 0.63,
"step": 2335
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.5559902991412571,
"learning_rate": 1.4838512263701184e-05,
"loss": 0.6228,
"step": 2340
},
{
"epoch": 1.2188149688149688,
"grad_norm": 0.5643995055948857,
"learning_rate": 1.4812023430569467e-05,
"loss": 0.619,
"step": 2345
},
{
"epoch": 1.2214137214137215,
"grad_norm": 0.5742853786867631,
"learning_rate": 1.4785490586232108e-05,
"loss": 0.6245,
"step": 2350
},
{
"epoch": 1.224012474012474,
"grad_norm": 0.5778953782438334,
"learning_rate": 1.4758913973360919e-05,
"loss": 0.6227,
"step": 2355
},
{
"epoch": 1.2266112266112266,
"grad_norm": 0.5925914426786582,
"learning_rate": 1.4732293835028038e-05,
"loss": 0.6107,
"step": 2360
},
{
"epoch": 1.2292099792099793,
"grad_norm": 0.5895371651072315,
"learning_rate": 1.4705630414703669e-05,
"loss": 0.6057,
"step": 2365
},
{
"epoch": 1.2318087318087318,
"grad_norm": 0.6081772444953167,
"learning_rate": 1.4678923956253894e-05,
"loss": 0.6424,
"step": 2370
},
{
"epoch": 1.2344074844074844,
"grad_norm": 0.5933961879145944,
"learning_rate": 1.4652174703938422e-05,
"loss": 0.6128,
"step": 2375
},
{
"epoch": 1.237006237006237,
"grad_norm": 0.6054620771138413,
"learning_rate": 1.4625382902408356e-05,
"loss": 0.6084,
"step": 2380
},
{
"epoch": 1.2396049896049897,
"grad_norm": 0.5776932281070712,
"learning_rate": 1.4598548796703953e-05,
"loss": 0.6217,
"step": 2385
},
{
"epoch": 1.2422037422037422,
"grad_norm": 0.5591153237371339,
"learning_rate": 1.4571672632252404e-05,
"loss": 0.6059,
"step": 2390
},
{
"epoch": 1.2448024948024947,
"grad_norm": 0.5667751253010028,
"learning_rate": 1.4544754654865553e-05,
"loss": 0.6269,
"step": 2395
},
{
"epoch": 1.2474012474012475,
"grad_norm": 0.5510576618147843,
"learning_rate": 1.4517795110737687e-05,
"loss": 0.6175,
"step": 2400
},
{
"epoch": 1.25,
"grad_norm": 0.5653685584114336,
"learning_rate": 1.4490794246443249e-05,
"loss": 0.6141,
"step": 2405
},
{
"epoch": 1.2525987525987525,
"grad_norm": 0.569054506821339,
"learning_rate": 1.446375230893462e-05,
"loss": 0.6132,
"step": 2410
},
{
"epoch": 1.255197505197505,
"grad_norm": 0.5530850077073164,
"learning_rate": 1.4436669545539824e-05,
"loss": 0.6112,
"step": 2415
},
{
"epoch": 1.2577962577962578,
"grad_norm": 0.5413151446394687,
"learning_rate": 1.4409546203960284e-05,
"loss": 0.6032,
"step": 2420
},
{
"epoch": 1.2603950103950103,
"grad_norm": 0.5230951552758679,
"learning_rate": 1.4382382532268566e-05,
"loss": 0.6144,
"step": 2425
},
{
"epoch": 1.262993762993763,
"grad_norm": 0.541771918919958,
"learning_rate": 1.4355178778906085e-05,
"loss": 0.6234,
"step": 2430
},
{
"epoch": 1.2655925155925156,
"grad_norm": 0.5203001197628181,
"learning_rate": 1.4327935192680857e-05,
"loss": 0.6045,
"step": 2435
},
{
"epoch": 1.2681912681912682,
"grad_norm": 0.5440655504089812,
"learning_rate": 1.4300652022765207e-05,
"loss": 0.6139,
"step": 2440
},
{
"epoch": 1.2707900207900207,
"grad_norm": 0.6149133483770466,
"learning_rate": 1.4273329518693497e-05,
"loss": 0.6145,
"step": 2445
},
{
"epoch": 1.2733887733887734,
"grad_norm": 0.6021509402407774,
"learning_rate": 1.4245967930359848e-05,
"loss": 0.6159,
"step": 2450
},
{
"epoch": 1.275987525987526,
"grad_norm": 0.5913158357105107,
"learning_rate": 1.4218567508015841e-05,
"loss": 0.6168,
"step": 2455
},
{
"epoch": 1.2785862785862787,
"grad_norm": 0.5618432626028342,
"learning_rate": 1.4191128502268242e-05,
"loss": 0.6152,
"step": 2460
},
{
"epoch": 1.2811850311850312,
"grad_norm": 0.5249984782845095,
"learning_rate": 1.4163651164076705e-05,
"loss": 0.6086,
"step": 2465
},
{
"epoch": 1.2837837837837838,
"grad_norm": 0.5672830278319703,
"learning_rate": 1.4136135744751468e-05,
"loss": 0.6114,
"step": 2470
},
{
"epoch": 1.2863825363825363,
"grad_norm": 0.5682891875562709,
"learning_rate": 1.4108582495951077e-05,
"loss": 0.6148,
"step": 2475
},
{
"epoch": 1.288981288981289,
"grad_norm": 0.5615341097983116,
"learning_rate": 1.408099166968005e-05,
"loss": 0.6111,
"step": 2480
},
{
"epoch": 1.2915800415800416,
"grad_norm": 0.5497563938968811,
"learning_rate": 1.4053363518286613e-05,
"loss": 0.6088,
"step": 2485
},
{
"epoch": 1.2941787941787941,
"grad_norm": 0.5582405570031684,
"learning_rate": 1.4025698294460362e-05,
"loss": 0.6136,
"step": 2490
},
{
"epoch": 1.2967775467775469,
"grad_norm": 0.6011380118880273,
"learning_rate": 1.3997996251229948e-05,
"loss": 0.6186,
"step": 2495
},
{
"epoch": 1.2993762993762994,
"grad_norm": 0.5496562610843831,
"learning_rate": 1.3970257641960795e-05,
"loss": 0.6182,
"step": 2500
},
{
"epoch": 1.301975051975052,
"grad_norm": 0.5687796275549053,
"learning_rate": 1.3942482720352761e-05,
"loss": 0.6157,
"step": 2505
},
{
"epoch": 1.3045738045738045,
"grad_norm": 0.574298920577317,
"learning_rate": 1.3914671740437811e-05,
"loss": 0.6136,
"step": 2510
},
{
"epoch": 1.3071725571725572,
"grad_norm": 0.5542768495449328,
"learning_rate": 1.3886824956577702e-05,
"loss": 0.6031,
"step": 2515
},
{
"epoch": 1.3097713097713097,
"grad_norm": 0.5666521327715712,
"learning_rate": 1.3858942623461664e-05,
"loss": 0.6062,
"step": 2520
},
{
"epoch": 1.3123700623700625,
"grad_norm": 0.5383202751991224,
"learning_rate": 1.3831024996104065e-05,
"loss": 0.6119,
"step": 2525
},
{
"epoch": 1.314968814968815,
"grad_norm": 0.550924324768737,
"learning_rate": 1.3803072329842073e-05,
"loss": 0.6218,
"step": 2530
},
{
"epoch": 1.3175675675675675,
"grad_norm": 0.5715325257279636,
"learning_rate": 1.3775084880333323e-05,
"loss": 0.6197,
"step": 2535
},
{
"epoch": 1.32016632016632,
"grad_norm": 0.5516314324953223,
"learning_rate": 1.3747062903553582e-05,
"loss": 0.5983,
"step": 2540
},
{
"epoch": 1.3227650727650728,
"grad_norm": 0.5587681122677882,
"learning_rate": 1.3719006655794414e-05,
"loss": 0.6104,
"step": 2545
},
{
"epoch": 1.3253638253638254,
"grad_norm": 0.5529619265877077,
"learning_rate": 1.3690916393660815e-05,
"loss": 0.6232,
"step": 2550
},
{
"epoch": 1.3279625779625779,
"grad_norm": 0.6007892832321496,
"learning_rate": 1.3662792374068896e-05,
"loss": 0.6246,
"step": 2555
},
{
"epoch": 1.3305613305613306,
"grad_norm": 0.5102078314524738,
"learning_rate": 1.3634634854243503e-05,
"loss": 0.6037,
"step": 2560
},
{
"epoch": 1.3331600831600832,
"grad_norm": 0.5068981925325898,
"learning_rate": 1.3606444091715883e-05,
"loss": 0.6056,
"step": 2565
},
{
"epoch": 1.3357588357588357,
"grad_norm": 0.5201200155890484,
"learning_rate": 1.3578220344321325e-05,
"loss": 0.6088,
"step": 2570
},
{
"epoch": 1.3383575883575882,
"grad_norm": 0.5411417638449072,
"learning_rate": 1.3549963870196796e-05,
"loss": 0.606,
"step": 2575
},
{
"epoch": 1.340956340956341,
"grad_norm": 0.5169808096315553,
"learning_rate": 1.3521674927778594e-05,
"loss": 0.6278,
"step": 2580
},
{
"epoch": 1.3435550935550935,
"grad_norm": 0.5658934679962141,
"learning_rate": 1.3493353775799967e-05,
"loss": 0.6067,
"step": 2585
},
{
"epoch": 1.3461538461538463,
"grad_norm": 0.5724238241800808,
"learning_rate": 1.3465000673288757e-05,
"loss": 0.6003,
"step": 2590
},
{
"epoch": 1.3487525987525988,
"grad_norm": 0.6105368545978801,
"learning_rate": 1.3436615879565025e-05,
"loss": 0.616,
"step": 2595
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.5188576936304327,
"learning_rate": 1.340819965423869e-05,
"loss": 0.6283,
"step": 2600
},
{
"epoch": 1.3539501039501038,
"grad_norm": 0.4959836182939828,
"learning_rate": 1.3379752257207144e-05,
"loss": 0.6157,
"step": 2605
},
{
"epoch": 1.3565488565488566,
"grad_norm": 0.5769448388897034,
"learning_rate": 1.3351273948652872e-05,
"loss": 0.6133,
"step": 2610
},
{
"epoch": 1.3591476091476091,
"grad_norm": 0.5647777721810548,
"learning_rate": 1.3322764989041086e-05,
"loss": 0.6047,
"step": 2615
},
{
"epoch": 1.3617463617463619,
"grad_norm": 0.5362269489941972,
"learning_rate": 1.329422563911734e-05,
"loss": 0.6244,
"step": 2620
},
{
"epoch": 1.3643451143451144,
"grad_norm": 0.5876277649004987,
"learning_rate": 1.326565615990513e-05,
"loss": 0.6094,
"step": 2625
},
{
"epoch": 1.366943866943867,
"grad_norm": 0.5771702605216373,
"learning_rate": 1.3237056812703517e-05,
"loss": 0.6162,
"step": 2630
},
{
"epoch": 1.3695426195426195,
"grad_norm": 0.5206111176210121,
"learning_rate": 1.3208427859084743e-05,
"loss": 0.5991,
"step": 2635
},
{
"epoch": 1.3721413721413722,
"grad_norm": 0.5703420517094763,
"learning_rate": 1.3179769560891837e-05,
"loss": 0.6158,
"step": 2640
},
{
"epoch": 1.3747401247401247,
"grad_norm": 0.5075630462180919,
"learning_rate": 1.315108218023621e-05,
"loss": 0.6157,
"step": 2645
},
{
"epoch": 1.3773388773388773,
"grad_norm": 0.5278204198500884,
"learning_rate": 1.3122365979495259e-05,
"loss": 0.611,
"step": 2650
},
{
"epoch": 1.37993762993763,
"grad_norm": 0.5830494022632724,
"learning_rate": 1.3093621221309982e-05,
"loss": 0.6226,
"step": 2655
},
{
"epoch": 1.3825363825363826,
"grad_norm": 0.5567019594449695,
"learning_rate": 1.3064848168582562e-05,
"loss": 0.6128,
"step": 2660
},
{
"epoch": 1.385135135135135,
"grad_norm": 0.5218600131647313,
"learning_rate": 1.3036047084473964e-05,
"loss": 0.6164,
"step": 2665
},
{
"epoch": 1.3877338877338876,
"grad_norm": 0.5550941890937359,
"learning_rate": 1.3007218232401535e-05,
"loss": 0.6178,
"step": 2670
},
{
"epoch": 1.3903326403326404,
"grad_norm": 0.5140778619937807,
"learning_rate": 1.2978361876036586e-05,
"loss": 0.6015,
"step": 2675
},
{
"epoch": 1.392931392931393,
"grad_norm": 0.5704426484745836,
"learning_rate": 1.2949478279301993e-05,
"loss": 0.6218,
"step": 2680
},
{
"epoch": 1.3955301455301456,
"grad_norm": 0.573333768381573,
"learning_rate": 1.292056770636976e-05,
"loss": 0.6195,
"step": 2685
},
{
"epoch": 1.3981288981288982,
"grad_norm": 0.5463535484803559,
"learning_rate": 1.2891630421658631e-05,
"loss": 0.619,
"step": 2690
},
{
"epoch": 1.4007276507276507,
"grad_norm": 0.5239768140578435,
"learning_rate": 1.2862666689831655e-05,
"loss": 0.5988,
"step": 2695
},
{
"epoch": 1.4033264033264032,
"grad_norm": 0.5254212957357791,
"learning_rate": 1.2833676775793766e-05,
"loss": 0.6089,
"step": 2700
},
{
"epoch": 1.405925155925156,
"grad_norm": 0.4999540179579075,
"learning_rate": 1.2804660944689368e-05,
"loss": 0.6161,
"step": 2705
},
{
"epoch": 1.4085239085239085,
"grad_norm": 0.5566115132096349,
"learning_rate": 1.2775619461899896e-05,
"loss": 0.6182,
"step": 2710
},
{
"epoch": 1.411122661122661,
"grad_norm": 0.5740812598543206,
"learning_rate": 1.2746552593041405e-05,
"loss": 0.598,
"step": 2715
},
{
"epoch": 1.4137214137214138,
"grad_norm": 0.5437551314682787,
"learning_rate": 1.2717460603962132e-05,
"loss": 0.609,
"step": 2720
},
{
"epoch": 1.4163201663201663,
"grad_norm": 0.527003171395807,
"learning_rate": 1.268834376074007e-05,
"loss": 0.6097,
"step": 2725
},
{
"epoch": 1.4189189189189189,
"grad_norm": 0.5151895053958203,
"learning_rate": 1.2659202329680515e-05,
"loss": 0.6223,
"step": 2730
},
{
"epoch": 1.4215176715176714,
"grad_norm": 0.5220435941255479,
"learning_rate": 1.2630036577313667e-05,
"loss": 0.6273,
"step": 2735
},
{
"epoch": 1.4241164241164241,
"grad_norm": 0.5541408035311566,
"learning_rate": 1.2600846770392155e-05,
"loss": 0.6115,
"step": 2740
},
{
"epoch": 1.4267151767151767,
"grad_norm": 0.5183266262374772,
"learning_rate": 1.2571633175888618e-05,
"loss": 0.6098,
"step": 2745
},
{
"epoch": 1.4293139293139294,
"grad_norm": 0.5467945168613629,
"learning_rate": 1.2542396060993256e-05,
"loss": 0.6129,
"step": 2750
},
{
"epoch": 1.431912681912682,
"grad_norm": 0.5408402850999704,
"learning_rate": 1.2513135693111399e-05,
"loss": 0.6113,
"step": 2755
},
{
"epoch": 1.4345114345114345,
"grad_norm": 0.5481669387572653,
"learning_rate": 1.2483852339861033e-05,
"loss": 0.6032,
"step": 2760
},
{
"epoch": 1.437110187110187,
"grad_norm": 0.5292679934908046,
"learning_rate": 1.2454546269070392e-05,
"loss": 0.6037,
"step": 2765
},
{
"epoch": 1.4397089397089398,
"grad_norm": 0.5744822983902161,
"learning_rate": 1.2425217748775464e-05,
"loss": 0.6099,
"step": 2770
},
{
"epoch": 1.4423076923076923,
"grad_norm": 0.5030366381929183,
"learning_rate": 1.239586704721758e-05,
"loss": 0.6067,
"step": 2775
},
{
"epoch": 1.444906444906445,
"grad_norm": 0.5833985268491657,
"learning_rate": 1.2366494432840937e-05,
"loss": 0.6039,
"step": 2780
},
{
"epoch": 1.4475051975051976,
"grad_norm": 0.5747742162047574,
"learning_rate": 1.2337100174290142e-05,
"loss": 0.6101,
"step": 2785
},
{
"epoch": 1.45010395010395,
"grad_norm": 0.5356407427398536,
"learning_rate": 1.2307684540407775e-05,
"loss": 0.6055,
"step": 2790
},
{
"epoch": 1.4527027027027026,
"grad_norm": 0.5413902409510034,
"learning_rate": 1.2278247800231901e-05,
"loss": 0.6162,
"step": 2795
},
{
"epoch": 1.4553014553014554,
"grad_norm": 0.5361345781691861,
"learning_rate": 1.2248790222993639e-05,
"loss": 0.6132,
"step": 2800
},
{
"epoch": 1.457900207900208,
"grad_norm": 0.48977234406410547,
"learning_rate": 1.221931207811468e-05,
"loss": 0.619,
"step": 2805
},
{
"epoch": 1.4604989604989604,
"grad_norm": 0.5539199421254352,
"learning_rate": 1.2189813635204825e-05,
"loss": 0.6034,
"step": 2810
},
{
"epoch": 1.4630977130977132,
"grad_norm": 0.5274980068953669,
"learning_rate": 1.2160295164059529e-05,
"loss": 0.6076,
"step": 2815
},
{
"epoch": 1.4656964656964657,
"grad_norm": 0.5081900105077334,
"learning_rate": 1.2130756934657424e-05,
"loss": 0.6097,
"step": 2820
},
{
"epoch": 1.4682952182952183,
"grad_norm": 0.5619754096937638,
"learning_rate": 1.210119921715785e-05,
"loss": 0.6156,
"step": 2825
},
{
"epoch": 1.4708939708939708,
"grad_norm": 0.5058475060346515,
"learning_rate": 1.2071622281898394e-05,
"loss": 0.6119,
"step": 2830
},
{
"epoch": 1.4734927234927235,
"grad_norm": 0.528937107568451,
"learning_rate": 1.2042026399392403e-05,
"loss": 0.6034,
"step": 2835
},
{
"epoch": 1.476091476091476,
"grad_norm": 0.5585222059699902,
"learning_rate": 1.2012411840326524e-05,
"loss": 0.6122,
"step": 2840
},
{
"epoch": 1.4786902286902288,
"grad_norm": 0.5474471042332577,
"learning_rate": 1.1982778875558215e-05,
"loss": 0.5978,
"step": 2845
},
{
"epoch": 1.4812889812889813,
"grad_norm": 0.5637920526811849,
"learning_rate": 1.1953127776113279e-05,
"loss": 0.6097,
"step": 2850
},
{
"epoch": 1.4838877338877339,
"grad_norm": 0.5153160827226365,
"learning_rate": 1.192345881318338e-05,
"loss": 0.6065,
"step": 2855
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.5089185825931368,
"learning_rate": 1.1893772258123554e-05,
"loss": 0.5955,
"step": 2860
},
{
"epoch": 1.4890852390852392,
"grad_norm": 0.5284121779832783,
"learning_rate": 1.1864068382449756e-05,
"loss": 0.6088,
"step": 2865
},
{
"epoch": 1.4916839916839917,
"grad_norm": 0.5231059878227796,
"learning_rate": 1.1834347457836337e-05,
"loss": 0.5976,
"step": 2870
},
{
"epoch": 1.4942827442827442,
"grad_norm": 0.5517740731632155,
"learning_rate": 1.180460975611359e-05,
"loss": 0.613,
"step": 2875
},
{
"epoch": 1.496881496881497,
"grad_norm": 0.46831838517285146,
"learning_rate": 1.1774855549265245e-05,
"loss": 0.6053,
"step": 2880
},
{
"epoch": 1.4994802494802495,
"grad_norm": 0.527557394883835,
"learning_rate": 1.1745085109426002e-05,
"loss": 0.6174,
"step": 2885
},
{
"epoch": 1.502079002079002,
"grad_norm": 0.5200048942038921,
"learning_rate": 1.171529870887902e-05,
"loss": 0.6066,
"step": 2890
},
{
"epoch": 1.5046777546777546,
"grad_norm": 0.5460408265611407,
"learning_rate": 1.1685496620053434e-05,
"loss": 0.6122,
"step": 2895
},
{
"epoch": 1.5072765072765073,
"grad_norm": 0.5171487101859985,
"learning_rate": 1.165567911552187e-05,
"loss": 0.607,
"step": 2900
},
{
"epoch": 1.5098752598752598,
"grad_norm": 0.5082429135678129,
"learning_rate": 1.1625846467997952e-05,
"loss": 0.6118,
"step": 2905
},
{
"epoch": 1.5124740124740126,
"grad_norm": 0.536744119246903,
"learning_rate": 1.1595998950333794e-05,
"loss": 0.6228,
"step": 2910
},
{
"epoch": 1.5150727650727651,
"grad_norm": 0.5540864582315153,
"learning_rate": 1.1566136835517518e-05,
"loss": 0.6085,
"step": 2915
},
{
"epoch": 1.5176715176715176,
"grad_norm": 0.5480519199954694,
"learning_rate": 1.1536260396670753e-05,
"loss": 0.6038,
"step": 2920
},
{
"epoch": 1.5202702702702702,
"grad_norm": 0.5320678068411181,
"learning_rate": 1.1506369907046135e-05,
"loss": 0.6027,
"step": 2925
},
{
"epoch": 1.5228690228690227,
"grad_norm": 0.5559206845902772,
"learning_rate": 1.1476465640024814e-05,
"loss": 0.6082,
"step": 2930
},
{
"epoch": 1.5254677754677755,
"grad_norm": 0.5919814949422626,
"learning_rate": 1.1446547869113944e-05,
"loss": 0.5897,
"step": 2935
},
{
"epoch": 1.5280665280665282,
"grad_norm": 0.5327268055659626,
"learning_rate": 1.1416616867944192e-05,
"loss": 0.611,
"step": 2940
},
{
"epoch": 1.5306652806652807,
"grad_norm": 0.4971186426325191,
"learning_rate": 1.1386672910267225e-05,
"loss": 0.6101,
"step": 2945
},
{
"epoch": 1.5332640332640333,
"grad_norm": 0.5640128227568957,
"learning_rate": 1.1356716269953213e-05,
"loss": 0.6199,
"step": 2950
},
{
"epoch": 1.5358627858627858,
"grad_norm": 0.5179662541283063,
"learning_rate": 1.1326747220988327e-05,
"loss": 0.6202,
"step": 2955
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.6423145905392057,
"learning_rate": 1.1296766037472223e-05,
"loss": 0.6144,
"step": 2960
},
{
"epoch": 1.541060291060291,
"grad_norm": 0.5256505864598588,
"learning_rate": 1.1266772993615543e-05,
"loss": 0.6066,
"step": 2965
},
{
"epoch": 1.5436590436590436,
"grad_norm": 0.5209882272221003,
"learning_rate": 1.1236768363737408e-05,
"loss": 0.613,
"step": 2970
},
{
"epoch": 1.5462577962577964,
"grad_norm": 0.5139682181751073,
"learning_rate": 1.120675242226289e-05,
"loss": 0.6195,
"step": 2975
},
{
"epoch": 1.5488565488565489,
"grad_norm": 0.5285679185697464,
"learning_rate": 1.1176725443720545e-05,
"loss": 0.6074,
"step": 2980
},
{
"epoch": 1.5514553014553014,
"grad_norm": 0.5176763822468469,
"learning_rate": 1.1146687702739855e-05,
"loss": 0.6225,
"step": 2985
},
{
"epoch": 1.554054054054054,
"grad_norm": 0.5346252383786081,
"learning_rate": 1.1116639474048741e-05,
"loss": 0.5955,
"step": 2990
},
{
"epoch": 1.5566528066528067,
"grad_norm": 0.5246377509399082,
"learning_rate": 1.108658103247104e-05,
"loss": 0.6075,
"step": 2995
},
{
"epoch": 1.5592515592515592,
"grad_norm": 0.5852349160305579,
"learning_rate": 1.1056512652924014e-05,
"loss": 0.6102,
"step": 3000
},
{
"epoch": 1.561850311850312,
"grad_norm": 0.5540954218703817,
"learning_rate": 1.1026434610415804e-05,
"loss": 0.6073,
"step": 3005
},
{
"epoch": 1.5644490644490645,
"grad_norm": 0.516164831755444,
"learning_rate": 1.099634718004293e-05,
"loss": 0.6144,
"step": 3010
},
{
"epoch": 1.567047817047817,
"grad_norm": 0.5238437043105261,
"learning_rate": 1.0966250636987776e-05,
"loss": 0.61,
"step": 3015
},
{
"epoch": 1.5696465696465696,
"grad_norm": 0.5499703346154395,
"learning_rate": 1.093614525651608e-05,
"loss": 0.6,
"step": 3020
},
{
"epoch": 1.572245322245322,
"grad_norm": 0.5392038397492541,
"learning_rate": 1.0906031313974392e-05,
"loss": 0.6004,
"step": 3025
},
{
"epoch": 1.5748440748440748,
"grad_norm": 0.5440366683585401,
"learning_rate": 1.0875909084787586e-05,
"loss": 0.6079,
"step": 3030
},
{
"epoch": 1.5774428274428276,
"grad_norm": 0.5280604613144251,
"learning_rate": 1.0845778844456319e-05,
"loss": 0.6028,
"step": 3035
},
{
"epoch": 1.5800415800415801,
"grad_norm": 0.5130988979787711,
"learning_rate": 1.0815640868554518e-05,
"loss": 0.6255,
"step": 3040
},
{
"epoch": 1.5826403326403327,
"grad_norm": 0.5347614455862642,
"learning_rate": 1.0785495432726864e-05,
"loss": 0.6144,
"step": 3045
},
{
"epoch": 1.5852390852390852,
"grad_norm": 0.5540466808635207,
"learning_rate": 1.0755342812686264e-05,
"loss": 0.618,
"step": 3050
},
{
"epoch": 1.5878378378378377,
"grad_norm": 0.5158267468916651,
"learning_rate": 1.0725183284211335e-05,
"loss": 0.6054,
"step": 3055
},
{
"epoch": 1.5904365904365905,
"grad_norm": 0.5235550308126831,
"learning_rate": 1.0695017123143881e-05,
"loss": 0.6113,
"step": 3060
},
{
"epoch": 1.593035343035343,
"grad_norm": 0.49676274074318394,
"learning_rate": 1.0664844605386357e-05,
"loss": 0.6066,
"step": 3065
},
{
"epoch": 1.5956340956340958,
"grad_norm": 0.5091724259037824,
"learning_rate": 1.0634666006899375e-05,
"loss": 0.6059,
"step": 3070
},
{
"epoch": 1.5982328482328483,
"grad_norm": 0.5308888501073562,
"learning_rate": 1.0604481603699146e-05,
"loss": 0.6077,
"step": 3075
},
{
"epoch": 1.6008316008316008,
"grad_norm": 0.5668118121411413,
"learning_rate": 1.0574291671854979e-05,
"loss": 0.6119,
"step": 3080
},
{
"epoch": 1.6034303534303533,
"grad_norm": 0.5232440524467463,
"learning_rate": 1.054409648748675e-05,
"loss": 0.6132,
"step": 3085
},
{
"epoch": 1.6060291060291059,
"grad_norm": 0.5326956732038823,
"learning_rate": 1.0513896326762363e-05,
"loss": 0.5957,
"step": 3090
},
{
"epoch": 1.6086278586278586,
"grad_norm": 0.5376136523378364,
"learning_rate": 1.0483691465895256e-05,
"loss": 0.5963,
"step": 3095
},
{
"epoch": 1.6112266112266114,
"grad_norm": 0.5590406644575509,
"learning_rate": 1.0453482181141838e-05,
"loss": 0.6114,
"step": 3100
},
{
"epoch": 1.613825363825364,
"grad_norm": 0.5348933441437478,
"learning_rate": 1.0423268748798992e-05,
"loss": 0.626,
"step": 3105
},
{
"epoch": 1.6164241164241164,
"grad_norm": 0.5587808171684693,
"learning_rate": 1.0393051445201518e-05,
"loss": 0.6035,
"step": 3110
},
{
"epoch": 1.619022869022869,
"grad_norm": 0.5217308721418593,
"learning_rate": 1.0362830546719644e-05,
"loss": 0.6007,
"step": 3115
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.5331440823163403,
"learning_rate": 1.0332606329756463e-05,
"loss": 0.6103,
"step": 3120
},
{
"epoch": 1.6242203742203742,
"grad_norm": 0.5354516402513061,
"learning_rate": 1.030237907074542e-05,
"loss": 0.6021,
"step": 3125
},
{
"epoch": 1.6268191268191268,
"grad_norm": 0.5301206175827867,
"learning_rate": 1.0272149046147788e-05,
"loss": 0.6032,
"step": 3130
},
{
"epoch": 1.6294178794178795,
"grad_norm": 0.5364702146724981,
"learning_rate": 1.0241916532450133e-05,
"loss": 0.6107,
"step": 3135
},
{
"epoch": 1.632016632016632,
"grad_norm": 0.5030704592075379,
"learning_rate": 1.0211681806161787e-05,
"loss": 0.5984,
"step": 3140
},
{
"epoch": 1.6346153846153846,
"grad_norm": 0.5001028568491547,
"learning_rate": 1.0181445143812312e-05,
"loss": 0.6011,
"step": 3145
},
{
"epoch": 1.637214137214137,
"grad_norm": 0.5537298706648461,
"learning_rate": 1.0151206821948985e-05,
"loss": 0.6348,
"step": 3150
},
{
"epoch": 1.6398128898128899,
"grad_norm": 0.5499538795880998,
"learning_rate": 1.0120967117134262e-05,
"loss": 0.6163,
"step": 3155
},
{
"epoch": 1.6424116424116424,
"grad_norm": 0.4944029513235786,
"learning_rate": 1.009072630594324e-05,
"loss": 0.5997,
"step": 3160
},
{
"epoch": 1.6450103950103951,
"grad_norm": 0.5560023248781629,
"learning_rate": 1.0060484664961136e-05,
"loss": 0.6066,
"step": 3165
},
{
"epoch": 1.6476091476091477,
"grad_norm": 0.5228794932020453,
"learning_rate": 1.0030242470780769e-05,
"loss": 0.6049,
"step": 3170
},
{
"epoch": 1.6502079002079002,
"grad_norm": 0.5251096124443742,
"learning_rate": 1e-05,
"loss": 0.617,
"step": 3175
},
{
"epoch": 1.6528066528066527,
"grad_norm": 0.504719489023802,
"learning_rate": 9.969757529219236e-06,
"loss": 0.611,
"step": 3180
},
{
"epoch": 1.6554054054054053,
"grad_norm": 0.5164130013232197,
"learning_rate": 9.939515335038866e-06,
"loss": 0.6071,
"step": 3185
},
{
"epoch": 1.658004158004158,
"grad_norm": 0.503984804974549,
"learning_rate": 9.909273694056765e-06,
"loss": 0.6098,
"step": 3190
},
{
"epoch": 1.6606029106029108,
"grad_norm": 0.5318145254626715,
"learning_rate": 9.879032882865745e-06,
"loss": 0.6046,
"step": 3195
},
{
"epoch": 1.6632016632016633,
"grad_norm": 0.49979486457828537,
"learning_rate": 9.848793178051017e-06,
"loss": 0.5942,
"step": 3200
},
{
"epoch": 1.6658004158004158,
"grad_norm": 0.5222561724594693,
"learning_rate": 9.818554856187692e-06,
"loss": 0.6102,
"step": 3205
},
{
"epoch": 1.6683991683991684,
"grad_norm": 0.5119064608955575,
"learning_rate": 9.788318193838218e-06,
"loss": 0.6063,
"step": 3210
},
{
"epoch": 1.6709979209979209,
"grad_norm": 0.49188265798150393,
"learning_rate": 9.758083467549868e-06,
"loss": 0.6007,
"step": 3215
},
{
"epoch": 1.6735966735966736,
"grad_norm": 0.5307992559310489,
"learning_rate": 9.727850953852217e-06,
"loss": 0.6037,
"step": 3220
},
{
"epoch": 1.6761954261954262,
"grad_norm": 0.5456235977768752,
"learning_rate": 9.697620929254584e-06,
"loss": 0.6244,
"step": 3225
},
{
"epoch": 1.678794178794179,
"grad_norm": 0.5088649958340964,
"learning_rate": 9.66739367024354e-06,
"loss": 0.6042,
"step": 3230
},
{
"epoch": 1.6813929313929314,
"grad_norm": 0.4953639561715028,
"learning_rate": 9.63716945328036e-06,
"loss": 0.5938,
"step": 3235
},
{
"epoch": 1.683991683991684,
"grad_norm": 0.49505908823955036,
"learning_rate": 9.606948554798482e-06,
"loss": 0.6144,
"step": 3240
},
{
"epoch": 1.6865904365904365,
"grad_norm": 0.5175987592879167,
"learning_rate": 9.57673125120101e-06,
"loss": 0.6098,
"step": 3245
},
{
"epoch": 1.689189189189189,
"grad_norm": 0.5388656862756696,
"learning_rate": 9.546517818858164e-06,
"loss": 0.6171,
"step": 3250
},
{
"epoch": 1.6917879417879418,
"grad_norm": 0.5153249162580613,
"learning_rate": 9.516308534104744e-06,
"loss": 0.5923,
"step": 3255
},
{
"epoch": 1.6943866943866945,
"grad_norm": 0.5185938318204056,
"learning_rate": 9.486103673237638e-06,
"loss": 0.589,
"step": 3260
},
{
"epoch": 1.696985446985447,
"grad_norm": 0.5735162818769731,
"learning_rate": 9.455903512513257e-06,
"loss": 0.6199,
"step": 3265
},
{
"epoch": 1.6995841995841996,
"grad_norm": 0.5550865974556703,
"learning_rate": 9.425708328145023e-06,
"loss": 0.603,
"step": 3270
},
{
"epoch": 1.7021829521829521,
"grad_norm": 0.5656039764802955,
"learning_rate": 9.395518396300857e-06,
"loss": 0.6036,
"step": 3275
},
{
"epoch": 1.7047817047817047,
"grad_norm": 0.5356649490240522,
"learning_rate": 9.365333993100628e-06,
"loss": 0.5951,
"step": 3280
},
{
"epoch": 1.7073804573804574,
"grad_norm": 0.5715455882322491,
"learning_rate": 9.335155394613641e-06,
"loss": 0.5989,
"step": 3285
},
{
"epoch": 1.70997920997921,
"grad_norm": 0.5497264191896297,
"learning_rate": 9.304982876856124e-06,
"loss": 0.6058,
"step": 3290
},
{
"epoch": 1.7125779625779627,
"grad_norm": 0.5051026018528313,
"learning_rate": 9.274816715788668e-06,
"loss": 0.5969,
"step": 3295
},
{
"epoch": 1.7151767151767152,
"grad_norm": 0.5204788821196659,
"learning_rate": 9.244657187313739e-06,
"loss": 0.611,
"step": 3300
},
{
"epoch": 1.7177754677754677,
"grad_norm": 0.48242559624890763,
"learning_rate": 9.214504567273139e-06,
"loss": 0.5893,
"step": 3305
},
{
"epoch": 1.7203742203742203,
"grad_norm": 0.5475293749782204,
"learning_rate": 9.184359131445487e-06,
"loss": 0.6128,
"step": 3310
},
{
"epoch": 1.722972972972973,
"grad_norm": 0.5227512974575209,
"learning_rate": 9.154221155543684e-06,
"loss": 0.5942,
"step": 3315
},
{
"epoch": 1.7255717255717256,
"grad_norm": 0.517359580415827,
"learning_rate": 9.124090915212415e-06,
"loss": 0.5995,
"step": 3320
},
{
"epoch": 1.7281704781704783,
"grad_norm": 0.5305121769843365,
"learning_rate": 9.093968686025612e-06,
"loss": 0.618,
"step": 3325
},
{
"epoch": 1.7307692307692308,
"grad_norm": 0.5312713830882955,
"learning_rate": 9.063854743483924e-06,
"loss": 0.5929,
"step": 3330
},
{
"epoch": 1.7333679833679834,
"grad_norm": 0.5159635712284465,
"learning_rate": 9.033749363012228e-06,
"loss": 0.5942,
"step": 3335
},
{
"epoch": 1.735966735966736,
"grad_norm": 0.5304955336277648,
"learning_rate": 9.003652819957073e-06,
"loss": 0.5955,
"step": 3340
},
{
"epoch": 1.7385654885654884,
"grad_norm": 0.5306004926529849,
"learning_rate": 8.973565389584199e-06,
"loss": 0.6157,
"step": 3345
},
{
"epoch": 1.7411642411642412,
"grad_norm": 0.5287399261745209,
"learning_rate": 8.943487347075988e-06,
"loss": 0.5867,
"step": 3350
},
{
"epoch": 1.743762993762994,
"grad_norm": 0.5230774773864855,
"learning_rate": 8.91341896752896e-06,
"loss": 0.5894,
"step": 3355
},
{
"epoch": 1.7463617463617465,
"grad_norm": 0.5155471622168707,
"learning_rate": 8.883360525951264e-06,
"loss": 0.5958,
"step": 3360
},
{
"epoch": 1.748960498960499,
"grad_norm": 0.5133214239778116,
"learning_rate": 8.85331229726015e-06,
"loss": 0.5935,
"step": 3365
},
{
"epoch": 1.7515592515592515,
"grad_norm": 0.5319975207166266,
"learning_rate": 8.823274556279455e-06,
"loss": 0.5934,
"step": 3370
},
{
"epoch": 1.754158004158004,
"grad_norm": 0.5364931909740585,
"learning_rate": 8.793247577737112e-06,
"loss": 0.6055,
"step": 3375
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.5093682789742844,
"learning_rate": 8.763231636262599e-06,
"loss": 0.5904,
"step": 3380
},
{
"epoch": 1.7593555093555093,
"grad_norm": 0.49218365344373355,
"learning_rate": 8.733227006384459e-06,
"loss": 0.6045,
"step": 3385
},
{
"epoch": 1.761954261954262,
"grad_norm": 0.5463702062588134,
"learning_rate": 8.703233962527779e-06,
"loss": 0.6039,
"step": 3390
},
{
"epoch": 1.7645530145530146,
"grad_norm": 0.5102092525737645,
"learning_rate": 8.673252779011676e-06,
"loss": 0.5887,
"step": 3395
},
{
"epoch": 1.7671517671517671,
"grad_norm": 0.5268210778389424,
"learning_rate": 8.643283730046788e-06,
"loss": 0.5983,
"step": 3400
},
{
"epoch": 1.7697505197505197,
"grad_norm": 0.5098708018226924,
"learning_rate": 8.61332708973278e-06,
"loss": 0.6043,
"step": 3405
},
{
"epoch": 1.7723492723492722,
"grad_norm": 0.48835524185569673,
"learning_rate": 8.583383132055814e-06,
"loss": 0.6107,
"step": 3410
},
{
"epoch": 1.774948024948025,
"grad_norm": 0.5701236303096751,
"learning_rate": 8.55345213088606e-06,
"loss": 0.6033,
"step": 3415
},
{
"epoch": 1.7775467775467777,
"grad_norm": 0.5137867247566509,
"learning_rate": 8.52353435997519e-06,
"loss": 0.5988,
"step": 3420
},
{
"epoch": 1.7801455301455302,
"grad_norm": 0.5185967787599991,
"learning_rate": 8.49363009295387e-06,
"loss": 0.6027,
"step": 3425
},
{
"epoch": 1.7827442827442828,
"grad_norm": 0.5232087879326293,
"learning_rate": 8.46373960332925e-06,
"loss": 0.5958,
"step": 3430
},
{
"epoch": 1.7853430353430353,
"grad_norm": 0.5227750785275999,
"learning_rate": 8.433863164482485e-06,
"loss": 0.6087,
"step": 3435
},
{
"epoch": 1.7879417879417878,
"grad_norm": 0.4796440456103048,
"learning_rate": 8.404001049666211e-06,
"loss": 0.5961,
"step": 3440
},
{
"epoch": 1.7905405405405406,
"grad_norm": 0.5114161067261779,
"learning_rate": 8.37415353200205e-06,
"loss": 0.5975,
"step": 3445
},
{
"epoch": 1.793139293139293,
"grad_norm": 0.5368539216036579,
"learning_rate": 8.344320884478133e-06,
"loss": 0.5995,
"step": 3450
},
{
"epoch": 1.7957380457380459,
"grad_norm": 0.5251230847938383,
"learning_rate": 8.314503379946569e-06,
"loss": 0.5924,
"step": 3455
},
{
"epoch": 1.7983367983367984,
"grad_norm": 0.5125606084891738,
"learning_rate": 8.284701291120984e-06,
"loss": 0.59,
"step": 3460
},
{
"epoch": 1.800935550935551,
"grad_norm": 0.5082724750112706,
"learning_rate": 8.254914890574001e-06,
"loss": 0.5783,
"step": 3465
},
{
"epoch": 1.8035343035343034,
"grad_norm": 0.5857171673424286,
"learning_rate": 8.225144450734755e-06,
"loss": 0.6159,
"step": 3470
},
{
"epoch": 1.806133056133056,
"grad_norm": 0.5189085809502059,
"learning_rate": 8.195390243886414e-06,
"loss": 0.5876,
"step": 3475
},
{
"epoch": 1.8087318087318087,
"grad_norm": 0.5054176942242024,
"learning_rate": 8.165652542163668e-06,
"loss": 0.6018,
"step": 3480
},
{
"epoch": 1.8113305613305615,
"grad_norm": 0.5245871555142563,
"learning_rate": 8.135931617550245e-06,
"loss": 0.607,
"step": 3485
},
{
"epoch": 1.813929313929314,
"grad_norm": 0.5240802764153503,
"learning_rate": 8.106227741876447e-06,
"loss": 0.6074,
"step": 3490
},
{
"epoch": 1.8165280665280665,
"grad_norm": 0.5431345881991243,
"learning_rate": 8.076541186816625e-06,
"loss": 0.6002,
"step": 3495
},
{
"epoch": 1.819126819126819,
"grad_norm": 0.5192080223913004,
"learning_rate": 8.046872223886723e-06,
"loss": 0.6039,
"step": 3500
},
{
"epoch": 1.8217255717255716,
"grad_norm": 0.5377132118040553,
"learning_rate": 8.017221124441787e-06,
"loss": 0.5866,
"step": 3505
},
{
"epoch": 1.8243243243243243,
"grad_norm": 0.4848076231447858,
"learning_rate": 7.98758815967348e-06,
"loss": 0.5926,
"step": 3510
},
{
"epoch": 1.8269230769230769,
"grad_norm": 0.49613490454069115,
"learning_rate": 7.957973600607597e-06,
"loss": 0.6029,
"step": 3515
},
{
"epoch": 1.8295218295218296,
"grad_norm": 0.49532299518482037,
"learning_rate": 7.92837771810161e-06,
"loss": 0.5893,
"step": 3520
},
{
"epoch": 1.8321205821205822,
"grad_norm": 0.5240393625504302,
"learning_rate": 7.898800782842153e-06,
"loss": 0.6044,
"step": 3525
},
{
"epoch": 1.8347193347193347,
"grad_norm": 0.5378508353167911,
"learning_rate": 7.86924306534258e-06,
"loss": 0.5892,
"step": 3530
},
{
"epoch": 1.8373180873180872,
"grad_norm": 0.5199976481026775,
"learning_rate": 7.839704835940473e-06,
"loss": 0.5982,
"step": 3535
},
{
"epoch": 1.83991683991684,
"grad_norm": 0.529729165924642,
"learning_rate": 7.81018636479518e-06,
"loss": 0.6012,
"step": 3540
},
{
"epoch": 1.8425155925155925,
"grad_norm": 0.4916121082024032,
"learning_rate": 7.780687921885324e-06,
"loss": 0.5977,
"step": 3545
},
{
"epoch": 1.8451143451143452,
"grad_norm": 0.5575500880550704,
"learning_rate": 7.751209777006363e-06,
"loss": 0.6003,
"step": 3550
},
{
"epoch": 1.8477130977130978,
"grad_norm": 0.5326362594297853,
"learning_rate": 7.7217521997681e-06,
"loss": 0.6039,
"step": 3555
},
{
"epoch": 1.8503118503118503,
"grad_norm": 0.6375012050960875,
"learning_rate": 7.69231545959223e-06,
"loss": 0.5903,
"step": 3560
},
{
"epoch": 1.8529106029106028,
"grad_norm": 0.5127398317219315,
"learning_rate": 7.66289982570986e-06,
"loss": 0.6127,
"step": 3565
},
{
"epoch": 1.8555093555093554,
"grad_norm": 0.5337342117702417,
"learning_rate": 7.633505567159068e-06,
"loss": 0.6106,
"step": 3570
},
{
"epoch": 1.8581081081081081,
"grad_norm": 0.5127146513499672,
"learning_rate": 7.604132952782421e-06,
"loss": 0.593,
"step": 3575
},
{
"epoch": 1.8607068607068609,
"grad_norm": 0.5475181259322507,
"learning_rate": 7.574782251224541e-06,
"loss": 0.6087,
"step": 3580
},
{
"epoch": 1.8633056133056134,
"grad_norm": 0.5057492212324644,
"learning_rate": 7.545453730929612e-06,
"loss": 0.5961,
"step": 3585
},
{
"epoch": 1.865904365904366,
"grad_norm": 0.5313703879609416,
"learning_rate": 7.516147660138968e-06,
"loss": 0.5826,
"step": 3590
},
{
"epoch": 1.8685031185031185,
"grad_norm": 0.5198552151529012,
"learning_rate": 7.486864306888608e-06,
"loss": 0.6015,
"step": 3595
},
{
"epoch": 1.871101871101871,
"grad_norm": 0.5410281063432927,
"learning_rate": 7.457603939006745e-06,
"loss": 0.6033,
"step": 3600
},
{
"epoch": 1.8737006237006237,
"grad_norm": 0.5557013181252524,
"learning_rate": 7.428366824111386e-06,
"loss": 0.5902,
"step": 3605
},
{
"epoch": 1.8762993762993763,
"grad_norm": 0.5633083487264265,
"learning_rate": 7.399153229607849e-06,
"loss": 0.6018,
"step": 3610
},
{
"epoch": 1.878898128898129,
"grad_norm": 0.48595216285541615,
"learning_rate": 7.369963422686335e-06,
"loss": 0.594,
"step": 3615
},
{
"epoch": 1.8814968814968815,
"grad_norm": 0.5169705729740565,
"learning_rate": 7.340797670319488e-06,
"loss": 0.5899,
"step": 3620
},
{
"epoch": 1.884095634095634,
"grad_norm": 0.4970606877334214,
"learning_rate": 7.311656239259934e-06,
"loss": 0.6148,
"step": 3625
},
{
"epoch": 1.8866943866943866,
"grad_norm": 0.5178433096469348,
"learning_rate": 7.282539396037868e-06,
"loss": 0.59,
"step": 3630
},
{
"epoch": 1.8892931392931391,
"grad_norm": 0.5085741805913727,
"learning_rate": 7.253447406958598e-06,
"loss": 0.5969,
"step": 3635
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.5339633821078309,
"learning_rate": 7.2243805381001084e-06,
"loss": 0.6013,
"step": 3640
},
{
"epoch": 1.8944906444906446,
"grad_norm": 0.5142299591444427,
"learning_rate": 7.195339055310635e-06,
"loss": 0.605,
"step": 3645
},
{
"epoch": 1.8970893970893972,
"grad_norm": 0.53012102257086,
"learning_rate": 7.166323224206236e-06,
"loss": 0.5934,
"step": 3650
},
{
"epoch": 1.8996881496881497,
"grad_norm": 0.5180950494011575,
"learning_rate": 7.13733331016835e-06,
"loss": 0.5967,
"step": 3655
},
{
"epoch": 1.9022869022869022,
"grad_norm": 0.5183177508817899,
"learning_rate": 7.108369578341372e-06,
"loss": 0.5823,
"step": 3660
},
{
"epoch": 1.9048856548856548,
"grad_norm": 0.5070875844600755,
"learning_rate": 7.079432293630244e-06,
"loss": 0.5956,
"step": 3665
},
{
"epoch": 1.9074844074844075,
"grad_norm": 0.6274267217296448,
"learning_rate": 7.050521720698009e-06,
"loss": 0.6114,
"step": 3670
},
{
"epoch": 1.91008316008316,
"grad_norm": 0.5602574620780145,
"learning_rate": 7.021638123963415e-06,
"loss": 0.586,
"step": 3675
},
{
"epoch": 1.9126819126819128,
"grad_norm": 0.517425035346988,
"learning_rate": 6.992781767598467e-06,
"loss": 0.5937,
"step": 3680
},
{
"epoch": 1.9152806652806653,
"grad_norm": 0.49392256995221356,
"learning_rate": 6.9639529155260355e-06,
"loss": 0.5893,
"step": 3685
},
{
"epoch": 1.9178794178794178,
"grad_norm": 0.5221453351667464,
"learning_rate": 6.935151831417442e-06,
"loss": 0.5921,
"step": 3690
},
{
"epoch": 1.9204781704781704,
"grad_norm": 0.5437297366337159,
"learning_rate": 6.906378778690023e-06,
"loss": 0.5941,
"step": 3695
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.5526585356985603,
"learning_rate": 6.8776340205047446e-06,
"loss": 0.5879,
"step": 3700
},
{
"epoch": 1.9256756756756757,
"grad_norm": 0.47075169096755787,
"learning_rate": 6.848917819763794e-06,
"loss": 0.587,
"step": 3705
},
{
"epoch": 1.9282744282744284,
"grad_norm": 0.5318393319444799,
"learning_rate": 6.8202304391081665e-06,
"loss": 0.5961,
"step": 3710
},
{
"epoch": 1.930873180873181,
"grad_norm": 0.5429244483259561,
"learning_rate": 6.791572140915258e-06,
"loss": 0.5972,
"step": 3715
},
{
"epoch": 1.9334719334719335,
"grad_norm": 0.7824999181116893,
"learning_rate": 6.762943187296487e-06,
"loss": 0.6025,
"step": 3720
},
{
"epoch": 1.936070686070686,
"grad_norm": 0.5506609069987528,
"learning_rate": 6.734343840094877e-06,
"loss": 0.5935,
"step": 3725
},
{
"epoch": 1.9386694386694385,
"grad_norm": 0.5393169028265578,
"learning_rate": 6.705774360882662e-06,
"loss": 0.5998,
"step": 3730
},
{
"epoch": 1.9412681912681913,
"grad_norm": 0.5196746072745972,
"learning_rate": 6.677235010958916e-06,
"loss": 0.6024,
"step": 3735
},
{
"epoch": 1.943866943866944,
"grad_norm": 0.5161195299496159,
"learning_rate": 6.648726051347132e-06,
"loss": 0.5923,
"step": 3740
},
{
"epoch": 1.9464656964656966,
"grad_norm": 0.5385756600061604,
"learning_rate": 6.6202477427928604e-06,
"loss": 0.5936,
"step": 3745
},
{
"epoch": 1.949064449064449,
"grad_norm": 0.5068176753311672,
"learning_rate": 6.591800345761313e-06,
"loss": 0.5857,
"step": 3750
},
{
"epoch": 1.9516632016632016,
"grad_norm": 0.5156660031341467,
"learning_rate": 6.563384120434978e-06,
"loss": 0.5998,
"step": 3755
},
{
"epoch": 1.9542619542619541,
"grad_norm": 0.5915967761576071,
"learning_rate": 6.5349993267112455e-06,
"loss": 0.5901,
"step": 3760
},
{
"epoch": 1.956860706860707,
"grad_norm": 0.5286753068152813,
"learning_rate": 6.506646224200036e-06,
"loss": 0.606,
"step": 3765
},
{
"epoch": 1.9594594594594594,
"grad_norm": 0.5510621595391834,
"learning_rate": 6.4783250722214066e-06,
"loss": 0.5996,
"step": 3770
},
{
"epoch": 1.9620582120582122,
"grad_norm": 0.5050260932024032,
"learning_rate": 6.450036129803205e-06,
"loss": 0.5811,
"step": 3775
},
{
"epoch": 1.9646569646569647,
"grad_norm": 0.5187426414977963,
"learning_rate": 6.42177965567868e-06,
"loss": 0.6012,
"step": 3780
},
{
"epoch": 1.9672557172557172,
"grad_norm": 0.5226860354335852,
"learning_rate": 6.393555908284119e-06,
"loss": 0.6002,
"step": 3785
},
{
"epoch": 1.9698544698544698,
"grad_norm": 0.5394751859696948,
"learning_rate": 6.3653651457565005e-06,
"loss": 0.6049,
"step": 3790
},
{
"epoch": 1.9724532224532223,
"grad_norm": 0.5041393147335839,
"learning_rate": 6.337207625931105e-06,
"loss": 0.5995,
"step": 3795
},
{
"epoch": 1.975051975051975,
"grad_norm": 0.533267009949286,
"learning_rate": 6.309083606339184e-06,
"loss": 0.5845,
"step": 3800
},
{
"epoch": 1.9776507276507278,
"grad_norm": 0.5376844644903337,
"learning_rate": 6.28099334420559e-06,
"loss": 0.5889,
"step": 3805
},
{
"epoch": 1.9802494802494803,
"grad_norm": 0.5424651587833641,
"learning_rate": 6.252937096446422e-06,
"loss": 0.5931,
"step": 3810
},
{
"epoch": 1.9828482328482329,
"grad_norm": 0.5323019777176436,
"learning_rate": 6.224915119666682e-06,
"loss": 0.6001,
"step": 3815
},
{
"epoch": 1.9854469854469854,
"grad_norm": 0.5719566751743559,
"learning_rate": 6.196927670157931e-06,
"loss": 0.5969,
"step": 3820
},
{
"epoch": 1.988045738045738,
"grad_norm": 0.5144348819478973,
"learning_rate": 6.168975003895939e-06,
"loss": 0.6027,
"step": 3825
},
{
"epoch": 1.9906444906444907,
"grad_norm": 0.5171213896946363,
"learning_rate": 6.141057376538338e-06,
"loss": 0.5986,
"step": 3830
},
{
"epoch": 1.9932432432432432,
"grad_norm": 0.5178977499722083,
"learning_rate": 6.113175043422301e-06,
"loss": 0.6069,
"step": 3835
},
{
"epoch": 1.995841995841996,
"grad_norm": 0.5264478858379251,
"learning_rate": 6.085328259562195e-06,
"loss": 0.5939,
"step": 3840
},
{
"epoch": 1.9984407484407485,
"grad_norm": 0.49088468990078843,
"learning_rate": 6.0575172796472405e-06,
"loss": 0.5899,
"step": 3845
},
{
"epoch": 2.0,
"eval_loss": 0.7568330764770508,
"eval_runtime": 106.5795,
"eval_samples_per_second": 77.032,
"eval_steps_per_second": 1.21,
"step": 3848
},
{
"epoch": 2.001039501039501,
"grad_norm": 0.6668925862814008,
"learning_rate": 6.0297423580392055e-06,
"loss": 0.5449,
"step": 3850
},
{
"epoch": 2.0036382536382535,
"grad_norm": 0.6496810654128746,
"learning_rate": 6.002003748770055e-06,
"loss": 0.5054,
"step": 3855
},
{
"epoch": 2.006237006237006,
"grad_norm": 0.6031011324917133,
"learning_rate": 5.9743017055396424e-06,
"loss": 0.508,
"step": 3860
},
{
"epoch": 2.008835758835759,
"grad_norm": 0.6010814236947867,
"learning_rate": 5.9466364817133886e-06,
"loss": 0.5042,
"step": 3865
},
{
"epoch": 2.0114345114345116,
"grad_norm": 0.5728886202402685,
"learning_rate": 5.9190083303199505e-06,
"loss": 0.5013,
"step": 3870
},
{
"epoch": 2.014033264033264,
"grad_norm": 0.5424146827083851,
"learning_rate": 5.891417504048926e-06,
"loss": 0.5075,
"step": 3875
},
{
"epoch": 2.0166320166320166,
"grad_norm": 0.5790413975893587,
"learning_rate": 5.863864255248533e-06,
"loss": 0.5179,
"step": 3880
},
{
"epoch": 2.019230769230769,
"grad_norm": 0.5318824862807435,
"learning_rate": 5.836348835923299e-06,
"loss": 0.5068,
"step": 3885
},
{
"epoch": 2.0218295218295217,
"grad_norm": 0.550883650215065,
"learning_rate": 5.808871497731758e-06,
"loss": 0.4974,
"step": 3890
},
{
"epoch": 2.024428274428274,
"grad_norm": 0.5879882582188948,
"learning_rate": 5.781432491984162e-06,
"loss": 0.5113,
"step": 3895
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.5546832686816904,
"learning_rate": 5.754032069640153e-06,
"loss": 0.5063,
"step": 3900
},
{
"epoch": 2.0296257796257797,
"grad_norm": 0.5211604329798696,
"learning_rate": 5.726670481306505e-06,
"loss": 0.5052,
"step": 3905
},
{
"epoch": 2.0322245322245323,
"grad_norm": 0.5455878123275217,
"learning_rate": 5.699347977234799e-06,
"loss": 0.5053,
"step": 3910
},
{
"epoch": 2.034823284823285,
"grad_norm": 0.5950657410818389,
"learning_rate": 5.672064807319146e-06,
"loss": 0.5152,
"step": 3915
},
{
"epoch": 2.0374220374220373,
"grad_norm": 0.5858939065311778,
"learning_rate": 5.644821221093916e-06,
"loss": 0.5059,
"step": 3920
},
{
"epoch": 2.04002079002079,
"grad_norm": 0.5347458170039379,
"learning_rate": 5.617617467731438e-06,
"loss": 0.5112,
"step": 3925
},
{
"epoch": 2.042619542619543,
"grad_norm": 0.5384308493309783,
"learning_rate": 5.5904537960397155e-06,
"loss": 0.4975,
"step": 3930
},
{
"epoch": 2.0452182952182953,
"grad_norm": 0.5730691087208541,
"learning_rate": 5.563330454460179e-06,
"loss": 0.4961,
"step": 3935
},
{
"epoch": 2.047817047817048,
"grad_norm": 0.5477717438993087,
"learning_rate": 5.536247691065384e-06,
"loss": 0.5121,
"step": 3940
},
{
"epoch": 2.0504158004158004,
"grad_norm": 0.5548351105040114,
"learning_rate": 5.50920575355675e-06,
"loss": 0.5079,
"step": 3945
},
{
"epoch": 2.053014553014553,
"grad_norm": 0.5759237090673845,
"learning_rate": 5.482204889262319e-06,
"loss": 0.5093,
"step": 3950
},
{
"epoch": 2.0556133056133055,
"grad_norm": 0.5547272581679922,
"learning_rate": 5.455245345134449e-06,
"loss": 0.4965,
"step": 3955
},
{
"epoch": 2.0582120582120584,
"grad_norm": 0.5545227019423067,
"learning_rate": 5.428327367747598e-06,
"loss": 0.5056,
"step": 3960
},
{
"epoch": 2.060810810810811,
"grad_norm": 0.5475321378756351,
"learning_rate": 5.401451203296049e-06,
"loss": 0.4992,
"step": 3965
},
{
"epoch": 2.0634095634095635,
"grad_norm": 0.5571990780758471,
"learning_rate": 5.37461709759165e-06,
"loss": 0.5029,
"step": 3970
},
{
"epoch": 2.066008316008316,
"grad_norm": 0.5646750653448925,
"learning_rate": 5.3478252960615794e-06,
"loss": 0.5045,
"step": 3975
},
{
"epoch": 2.0686070686070686,
"grad_norm": 0.5748986438531573,
"learning_rate": 5.321076043746108e-06,
"loss": 0.4982,
"step": 3980
},
{
"epoch": 2.071205821205821,
"grad_norm": 0.5627371455320099,
"learning_rate": 5.2943695852963325e-06,
"loss": 0.5096,
"step": 3985
},
{
"epoch": 2.0738045738045736,
"grad_norm": 0.5535736181815755,
"learning_rate": 5.267706164971966e-06,
"loss": 0.502,
"step": 3990
},
{
"epoch": 2.0764033264033266,
"grad_norm": 0.5361674621317485,
"learning_rate": 5.241086026639079e-06,
"loss": 0.5056,
"step": 3995
},
{
"epoch": 2.079002079002079,
"grad_norm": 0.5644675385907009,
"learning_rate": 5.214509413767892e-06,
"loss": 0.5142,
"step": 4000
},
{
"epoch": 2.0816008316008316,
"grad_norm": 0.5784423395730652,
"learning_rate": 5.187976569430535e-06,
"loss": 0.5087,
"step": 4005
},
{
"epoch": 2.084199584199584,
"grad_norm": 0.592275009867849,
"learning_rate": 5.1614877362988205e-06,
"loss": 0.5027,
"step": 4010
},
{
"epoch": 2.0867983367983367,
"grad_norm": 0.5950969421446421,
"learning_rate": 5.1350431566420326e-06,
"loss": 0.5046,
"step": 4015
},
{
"epoch": 2.0893970893970892,
"grad_norm": 0.5446288597399254,
"learning_rate": 5.108643072324717e-06,
"loss": 0.5107,
"step": 4020
},
{
"epoch": 2.091995841995842,
"grad_norm": 0.5839095060604741,
"learning_rate": 5.082287724804453e-06,
"loss": 0.507,
"step": 4025
},
{
"epoch": 2.0945945945945947,
"grad_norm": 0.5801086689129009,
"learning_rate": 5.055977355129653e-06,
"loss": 0.5007,
"step": 4030
},
{
"epoch": 2.0971933471933473,
"grad_norm": 0.5394364509572592,
"learning_rate": 5.02971220393736e-06,
"loss": 0.5079,
"step": 4035
},
{
"epoch": 2.0997920997921,
"grad_norm": 0.5649634959442216,
"learning_rate": 5.003492511451051e-06,
"loss": 0.5042,
"step": 4040
},
{
"epoch": 2.1023908523908523,
"grad_norm": 0.5697263888969452,
"learning_rate": 4.977318517478421e-06,
"loss": 0.5012,
"step": 4045
},
{
"epoch": 2.104989604989605,
"grad_norm": 0.6166161420968725,
"learning_rate": 4.951190461409214e-06,
"loss": 0.511,
"step": 4050
},
{
"epoch": 2.1075883575883574,
"grad_norm": 0.5624903877149114,
"learning_rate": 4.925108582213013e-06,
"loss": 0.5104,
"step": 4055
},
{
"epoch": 2.1101871101871104,
"grad_norm": 0.5731533074752744,
"learning_rate": 4.899073118437063e-06,
"loss": 0.5109,
"step": 4060
},
{
"epoch": 2.112785862785863,
"grad_norm": 0.5800809144559984,
"learning_rate": 4.873084308204101e-06,
"loss": 0.4999,
"step": 4065
},
{
"epoch": 2.1153846153846154,
"grad_norm": 0.5639967552020521,
"learning_rate": 4.84714238921015e-06,
"loss": 0.4972,
"step": 4070
},
{
"epoch": 2.117983367983368,
"grad_norm": 0.5540959676849216,
"learning_rate": 4.821247598722373e-06,
"loss": 0.4887,
"step": 4075
},
{
"epoch": 2.1205821205821205,
"grad_norm": 0.5820289593717347,
"learning_rate": 4.7954001735768925e-06,
"loss": 0.4983,
"step": 4080
},
{
"epoch": 2.123180873180873,
"grad_norm": 0.5665889825124238,
"learning_rate": 4.7696003501766155e-06,
"loss": 0.4928,
"step": 4085
},
{
"epoch": 2.125779625779626,
"grad_norm": 0.5786397074647865,
"learning_rate": 4.7438483644890776e-06,
"loss": 0.509,
"step": 4090
},
{
"epoch": 2.1283783783783785,
"grad_norm": 0.5544771675503383,
"learning_rate": 4.718144452044299e-06,
"loss": 0.5088,
"step": 4095
},
{
"epoch": 2.130977130977131,
"grad_norm": 0.5842089375560309,
"learning_rate": 4.692488847932601e-06,
"loss": 0.5131,
"step": 4100
},
{
"epoch": 2.1335758835758836,
"grad_norm": 0.5572667236950973,
"learning_rate": 4.666881786802492e-06,
"loss": 0.513,
"step": 4105
},
{
"epoch": 2.136174636174636,
"grad_norm": 0.6261912246125306,
"learning_rate": 4.6413235028584804e-06,
"loss": 0.5053,
"step": 4110
},
{
"epoch": 2.1387733887733886,
"grad_norm": 0.5702222004267216,
"learning_rate": 4.615814229858969e-06,
"loss": 0.495,
"step": 4115
},
{
"epoch": 2.141372141372141,
"grad_norm": 0.5790166866248228,
"learning_rate": 4.590354201114103e-06,
"loss": 0.4973,
"step": 4120
},
{
"epoch": 2.143970893970894,
"grad_norm": 0.5603345931162405,
"learning_rate": 4.564943649483625e-06,
"loss": 0.5063,
"step": 4125
},
{
"epoch": 2.1465696465696467,
"grad_norm": 0.5569620723069888,
"learning_rate": 4.539582807374756e-06,
"loss": 0.4982,
"step": 4130
},
{
"epoch": 2.149168399168399,
"grad_norm": 0.6019143400672264,
"learning_rate": 4.514271906740082e-06,
"loss": 0.5116,
"step": 4135
},
{
"epoch": 2.1517671517671517,
"grad_norm": 0.5668221330685952,
"learning_rate": 4.489011179075408e-06,
"loss": 0.4989,
"step": 4140
},
{
"epoch": 2.1543659043659042,
"grad_norm": 0.5777522818500115,
"learning_rate": 4.46380085541765e-06,
"loss": 0.4866,
"step": 4145
},
{
"epoch": 2.156964656964657,
"grad_norm": 0.5577057043245417,
"learning_rate": 4.438641166342733e-06,
"loss": 0.5048,
"step": 4150
},
{
"epoch": 2.1595634095634098,
"grad_norm": 0.5811543313527234,
"learning_rate": 4.413532341963477e-06,
"loss": 0.5024,
"step": 4155
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.5901491166344425,
"learning_rate": 4.388474611927472e-06,
"loss": 0.4985,
"step": 4160
},
{
"epoch": 2.164760914760915,
"grad_norm": 0.5537573132486768,
"learning_rate": 4.363468205415014e-06,
"loss": 0.4956,
"step": 4165
},
{
"epoch": 2.1673596673596673,
"grad_norm": 0.5660909275231115,
"learning_rate": 4.338513351136977e-06,
"loss": 0.4928,
"step": 4170
},
{
"epoch": 2.16995841995842,
"grad_norm": 0.5991376447658537,
"learning_rate": 4.313610277332732e-06,
"loss": 0.499,
"step": 4175
},
{
"epoch": 2.1725571725571724,
"grad_norm": 0.6075244421550833,
"learning_rate": 4.288759211768072e-06,
"loss": 0.5033,
"step": 4180
},
{
"epoch": 2.1751559251559254,
"grad_norm": 0.5517113456938116,
"learning_rate": 4.263960381733106e-06,
"loss": 0.4951,
"step": 4185
},
{
"epoch": 2.177754677754678,
"grad_norm": 0.5677317519142169,
"learning_rate": 4.2392140140401996e-06,
"loss": 0.4978,
"step": 4190
},
{
"epoch": 2.1803534303534304,
"grad_norm": 0.569770318980704,
"learning_rate": 4.214520335021896e-06,
"loss": 0.4939,
"step": 4195
},
{
"epoch": 2.182952182952183,
"grad_norm": 0.6118856781558967,
"learning_rate": 4.189879570528831e-06,
"loss": 0.5069,
"step": 4200
},
{
"epoch": 2.1855509355509355,
"grad_norm": 0.606019484002795,
"learning_rate": 4.165291945927693e-06,
"loss": 0.5043,
"step": 4205
},
{
"epoch": 2.188149688149688,
"grad_norm": 0.5556726937928989,
"learning_rate": 4.140757686099137e-06,
"loss": 0.4868,
"step": 4210
},
{
"epoch": 2.1907484407484406,
"grad_norm": 0.5837653560310493,
"learning_rate": 4.116277015435743e-06,
"loss": 0.5015,
"step": 4215
},
{
"epoch": 2.1933471933471935,
"grad_norm": 0.588179392816627,
"learning_rate": 4.091850157839963e-06,
"loss": 0.503,
"step": 4220
},
{
"epoch": 2.195945945945946,
"grad_norm": 0.5994112141091228,
"learning_rate": 4.067477336722063e-06,
"loss": 0.5124,
"step": 4225
},
{
"epoch": 2.1985446985446986,
"grad_norm": 0.6035705377584152,
"learning_rate": 4.043158774998093e-06,
"loss": 0.5089,
"step": 4230
},
{
"epoch": 2.201143451143451,
"grad_norm": 0.5688655453887249,
"learning_rate": 4.01889469508784e-06,
"loss": 0.5043,
"step": 4235
},
{
"epoch": 2.2037422037422036,
"grad_norm": 0.5917248307889117,
"learning_rate": 3.994685318912794e-06,
"loss": 0.5163,
"step": 4240
},
{
"epoch": 2.206340956340956,
"grad_norm": 0.5394197139265716,
"learning_rate": 3.970530867894114e-06,
"loss": 0.5069,
"step": 4245
},
{
"epoch": 2.208939708939709,
"grad_norm": 0.576389590721846,
"learning_rate": 3.946431562950624e-06,
"loss": 0.5005,
"step": 4250
},
{
"epoch": 2.2115384615384617,
"grad_norm": 0.595056352282733,
"learning_rate": 3.922387624496762e-06,
"loss": 0.5043,
"step": 4255
},
{
"epoch": 2.214137214137214,
"grad_norm": 0.5572014241693316,
"learning_rate": 3.89839927244058e-06,
"loss": 0.5074,
"step": 4260
},
{
"epoch": 2.2167359667359667,
"grad_norm": 0.5659607082452609,
"learning_rate": 3.87446672618174e-06,
"loss": 0.5078,
"step": 4265
},
{
"epoch": 2.2193347193347193,
"grad_norm": 0.5659519225313114,
"learning_rate": 3.850590204609501e-06,
"loss": 0.5042,
"step": 4270
},
{
"epoch": 2.221933471933472,
"grad_norm": 0.5520449827126801,
"learning_rate": 3.826769926100699e-06,
"loss": 0.5049,
"step": 4275
},
{
"epoch": 2.2245322245322248,
"grad_norm": 0.9464101657600216,
"learning_rate": 3.803006108517786e-06,
"loss": 0.5049,
"step": 4280
},
{
"epoch": 2.2271309771309773,
"grad_norm": 0.5678141927512144,
"learning_rate": 3.7792989692068018e-06,
"loss": 0.5035,
"step": 4285
},
{
"epoch": 2.22972972972973,
"grad_norm": 0.5706457165389711,
"learning_rate": 3.755648724995404e-06,
"loss": 0.4968,
"step": 4290
},
{
"epoch": 2.2323284823284824,
"grad_norm": 0.6171952232229321,
"learning_rate": 3.732055592190893e-06,
"loss": 0.5082,
"step": 4295
},
{
"epoch": 2.234927234927235,
"grad_norm": 0.600087841592369,
"learning_rate": 3.7085197865782085e-06,
"loss": 0.5039,
"step": 4300
},
{
"epoch": 2.2375259875259874,
"grad_norm": 0.5625791681960587,
"learning_rate": 3.6850415234179805e-06,
"loss": 0.5041,
"step": 4305
},
{
"epoch": 2.24012474012474,
"grad_norm": 0.596631151098558,
"learning_rate": 3.661621017444551e-06,
"loss": 0.5013,
"step": 4310
},
{
"epoch": 2.242723492723493,
"grad_norm": 0.5324892002802353,
"learning_rate": 3.638258482863999e-06,
"loss": 0.4958,
"step": 4315
},
{
"epoch": 2.2453222453222454,
"grad_norm": 0.5720843807653389,
"learning_rate": 3.6149541333522053e-06,
"loss": 0.4994,
"step": 4320
},
{
"epoch": 2.247920997920998,
"grad_norm": 0.5550466004827054,
"learning_rate": 3.5917081820528765e-06,
"loss": 0.5066,
"step": 4325
},
{
"epoch": 2.2505197505197505,
"grad_norm": 0.5853024960860177,
"learning_rate": 3.568520841575601e-06,
"loss": 0.4984,
"step": 4330
},
{
"epoch": 2.253118503118503,
"grad_norm": 0.5591994418510899,
"learning_rate": 3.5453923239939192e-06,
"loss": 0.5057,
"step": 4335
},
{
"epoch": 2.2557172557172556,
"grad_norm": 0.595876343661196,
"learning_rate": 3.5223228408433564e-06,
"loss": 0.4978,
"step": 4340
},
{
"epoch": 2.258316008316008,
"grad_norm": 0.5689153116175016,
"learning_rate": 3.499312603119517e-06,
"loss": 0.5045,
"step": 4345
},
{
"epoch": 2.260914760914761,
"grad_norm": 0.617131890152672,
"learning_rate": 3.4763618212761376e-06,
"loss": 0.5068,
"step": 4350
},
{
"epoch": 2.2635135135135136,
"grad_norm": 0.5756546456394432,
"learning_rate": 3.453470705223162e-06,
"loss": 0.5006,
"step": 4355
},
{
"epoch": 2.266112266112266,
"grad_norm": 0.5904214264537652,
"learning_rate": 3.430639464324825e-06,
"loss": 0.509,
"step": 4360
},
{
"epoch": 2.2687110187110187,
"grad_norm": 0.5486092941094705,
"learning_rate": 3.407868307397747e-06,
"loss": 0.4956,
"step": 4365
},
{
"epoch": 2.271309771309771,
"grad_norm": 0.6341681884960043,
"learning_rate": 3.3851574427090028e-06,
"loss": 0.502,
"step": 4370
},
{
"epoch": 2.2739085239085237,
"grad_norm": 0.5816609147620979,
"learning_rate": 3.362507077974234e-06,
"loss": 0.5053,
"step": 4375
},
{
"epoch": 2.2765072765072767,
"grad_norm": 0.5992096025731823,
"learning_rate": 3.339917420355746e-06,
"loss": 0.4915,
"step": 4380
},
{
"epoch": 2.279106029106029,
"grad_norm": 0.5634477819700985,
"learning_rate": 3.3173886764606133e-06,
"loss": 0.5034,
"step": 4385
},
{
"epoch": 2.2817047817047817,
"grad_norm": 0.5407976728647481,
"learning_rate": 3.2949210523387786e-06,
"loss": 0.4999,
"step": 4390
},
{
"epoch": 2.2843035343035343,
"grad_norm": 0.562584471586657,
"learning_rate": 3.2725147534811885e-06,
"loss": 0.502,
"step": 4395
},
{
"epoch": 2.286902286902287,
"grad_norm": 0.5561376196303791,
"learning_rate": 3.250169984817897e-06,
"loss": 0.4996,
"step": 4400
},
{
"epoch": 2.2895010395010393,
"grad_norm": 0.5288577740017452,
"learning_rate": 3.2278869507161947e-06,
"loss": 0.4923,
"step": 4405
},
{
"epoch": 2.2920997920997923,
"grad_norm": 0.5564297800059832,
"learning_rate": 3.2056658549787513e-06,
"loss": 0.5004,
"step": 4410
},
{
"epoch": 2.294698544698545,
"grad_norm": 0.5807459489768877,
"learning_rate": 3.1835069008417307e-06,
"loss": 0.513,
"step": 4415
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.5698550653282723,
"learning_rate": 3.1614102909729547e-06,
"loss": 0.5017,
"step": 4420
},
{
"epoch": 2.29989604989605,
"grad_norm": 0.5484677379859523,
"learning_rate": 3.139376227470038e-06,
"loss": 0.4948,
"step": 4425
},
{
"epoch": 2.3024948024948024,
"grad_norm": 0.6034586450111454,
"learning_rate": 3.1174049118585303e-06,
"loss": 0.5057,
"step": 4430
},
{
"epoch": 2.305093555093555,
"grad_norm": 0.6209164341363942,
"learning_rate": 3.0954965450900963e-06,
"loss": 0.5013,
"step": 4435
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.608022280956626,
"learning_rate": 3.0736513275406565e-06,
"loss": 0.5007,
"step": 4440
},
{
"epoch": 2.3102910602910605,
"grad_norm": 0.5822912947800326,
"learning_rate": 3.0518694590085608e-06,
"loss": 0.4878,
"step": 4445
},
{
"epoch": 2.312889812889813,
"grad_norm": 0.6052596335796735,
"learning_rate": 3.0301511387127746e-06,
"loss": 0.5048,
"step": 4450
},
{
"epoch": 2.3154885654885655,
"grad_norm": 0.6109257960539891,
"learning_rate": 3.0084965652910314e-06,
"loss": 0.4979,
"step": 4455
},
{
"epoch": 2.318087318087318,
"grad_norm": 0.6001612610617809,
"learning_rate": 2.9869059367980402e-06,
"loss": 0.502,
"step": 4460
},
{
"epoch": 2.3206860706860706,
"grad_norm": 0.5748427683895482,
"learning_rate": 2.965379450703665e-06,
"loss": 0.4976,
"step": 4465
},
{
"epoch": 2.323284823284823,
"grad_norm": 0.6062682998551074,
"learning_rate": 2.943917303891107e-06,
"loss": 0.51,
"step": 4470
},
{
"epoch": 2.3258835758835756,
"grad_norm": 0.5866563557363672,
"learning_rate": 2.92251969265512e-06,
"loss": 0.5063,
"step": 4475
},
{
"epoch": 2.3284823284823286,
"grad_norm": 0.5512520483966091,
"learning_rate": 2.9011868127002153e-06,
"loss": 0.4934,
"step": 4480
},
{
"epoch": 2.331081081081081,
"grad_norm": 0.5427787073773119,
"learning_rate": 2.879918859138857e-06,
"loss": 0.4909,
"step": 4485
},
{
"epoch": 2.3336798336798337,
"grad_norm": 0.5807057669777462,
"learning_rate": 2.8587160264896873e-06,
"loss": 0.4955,
"step": 4490
},
{
"epoch": 2.336278586278586,
"grad_norm": 0.6094348111906394,
"learning_rate": 2.8375785086757533e-06,
"loss": 0.5028,
"step": 4495
},
{
"epoch": 2.3388773388773387,
"grad_norm": 0.569446343993791,
"learning_rate": 2.8165064990227255e-06,
"loss": 0.4966,
"step": 4500
},
{
"epoch": 2.3414760914760917,
"grad_norm": 0.546949602624272,
"learning_rate": 2.795500190257122e-06,
"loss": 0.5041,
"step": 4505
},
{
"epoch": 2.3440748440748442,
"grad_norm": 0.5841136870299933,
"learning_rate": 2.774559774504566e-06,
"loss": 0.5093,
"step": 4510
},
{
"epoch": 2.3466735966735968,
"grad_norm": 0.5894084600218413,
"learning_rate": 2.75368544328801e-06,
"loss": 0.5018,
"step": 4515
},
{
"epoch": 2.3492723492723493,
"grad_norm": 0.5849212705691518,
"learning_rate": 2.7328773875259905e-06,
"loss": 0.4983,
"step": 4520
},
{
"epoch": 2.351871101871102,
"grad_norm": 0.5594245456576148,
"learning_rate": 2.7121357975308893e-06,
"loss": 0.5116,
"step": 4525
},
{
"epoch": 2.3544698544698544,
"grad_norm": 0.5904437980074254,
"learning_rate": 2.691460863007178e-06,
"loss": 0.5046,
"step": 4530
},
{
"epoch": 2.357068607068607,
"grad_norm": 0.6092333364117684,
"learning_rate": 2.670852773049698e-06,
"loss": 0.492,
"step": 4535
},
{
"epoch": 2.35966735966736,
"grad_norm": 0.5406949036065258,
"learning_rate": 2.6503117161419246e-06,
"loss": 0.4966,
"step": 4540
},
{
"epoch": 2.3622661122661124,
"grad_norm": 0.6499059905714683,
"learning_rate": 2.6298378801542337e-06,
"loss": 0.4995,
"step": 4545
},
{
"epoch": 2.364864864864865,
"grad_norm": 0.5417621572559367,
"learning_rate": 2.6094314523422035e-06,
"loss": 0.4903,
"step": 4550
},
{
"epoch": 2.3674636174636174,
"grad_norm": 0.5832045594170597,
"learning_rate": 2.589092619344885e-06,
"loss": 0.4937,
"step": 4555
},
{
"epoch": 2.37006237006237,
"grad_norm": 0.570494106023411,
"learning_rate": 2.5688215671830975e-06,
"loss": 0.4967,
"step": 4560
},
{
"epoch": 2.3726611226611225,
"grad_norm": 0.5563324888807575,
"learning_rate": 2.54861848125774e-06,
"loss": 0.5039,
"step": 4565
},
{
"epoch": 2.375259875259875,
"grad_norm": 0.5891719757564269,
"learning_rate": 2.5284835463480774e-06,
"loss": 0.5009,
"step": 4570
},
{
"epoch": 2.377858627858628,
"grad_norm": 0.6000418457824788,
"learning_rate": 2.5084169466100626e-06,
"loss": 0.494,
"step": 4575
},
{
"epoch": 2.3804573804573805,
"grad_norm": 0.5612803989317922,
"learning_rate": 2.4884188655746554e-06,
"loss": 0.4974,
"step": 4580
},
{
"epoch": 2.383056133056133,
"grad_norm": 0.5574484874125388,
"learning_rate": 2.468489486146125e-06,
"loss": 0.4953,
"step": 4585
},
{
"epoch": 2.3856548856548856,
"grad_norm": 0.550628523258081,
"learning_rate": 2.4486289906003935e-06,
"loss": 0.5182,
"step": 4590
},
{
"epoch": 2.388253638253638,
"grad_norm": 0.567017209479145,
"learning_rate": 2.4288375605833726e-06,
"loss": 0.4907,
"step": 4595
},
{
"epoch": 2.390852390852391,
"grad_norm": 0.5474114054711359,
"learning_rate": 2.4091153771092847e-06,
"loss": 0.4976,
"step": 4600
},
{
"epoch": 2.3934511434511436,
"grad_norm": 0.5567614559206484,
"learning_rate": 2.3894626205590177e-06,
"loss": 0.4925,
"step": 4605
},
{
"epoch": 2.396049896049896,
"grad_norm": 0.5620691248378288,
"learning_rate": 2.36987947067848e-06,
"loss": 0.4892,
"step": 4610
},
{
"epoch": 2.3986486486486487,
"grad_norm": 0.5471599595016963,
"learning_rate": 2.3503661065769523e-06,
"loss": 0.5006,
"step": 4615
},
{
"epoch": 2.401247401247401,
"grad_norm": 0.5643679588409989,
"learning_rate": 2.330922706725437e-06,
"loss": 0.5052,
"step": 4620
},
{
"epoch": 2.4038461538461537,
"grad_norm": 0.5992107723526578,
"learning_rate": 2.3115494489550517e-06,
"loss": 0.4944,
"step": 4625
},
{
"epoch": 2.4064449064449063,
"grad_norm": 0.5735681525239322,
"learning_rate": 2.292246510455375e-06,
"loss": 0.5023,
"step": 4630
},
{
"epoch": 2.4090436590436592,
"grad_norm": 0.5569413415577497,
"learning_rate": 2.2730140677728485e-06,
"loss": 0.5017,
"step": 4635
},
{
"epoch": 2.4116424116424118,
"grad_norm": 0.5657509769713301,
"learning_rate": 2.253852296809148e-06,
"loss": 0.5018,
"step": 4640
},
{
"epoch": 2.4142411642411643,
"grad_norm": 0.561092028484337,
"learning_rate": 2.234761372819577e-06,
"loss": 0.5005,
"step": 4645
},
{
"epoch": 2.416839916839917,
"grad_norm": 0.584135442702734,
"learning_rate": 2.215741470411472e-06,
"loss": 0.495,
"step": 4650
},
{
"epoch": 2.4194386694386694,
"grad_norm": 0.5585660724073979,
"learning_rate": 2.196792763542599e-06,
"loss": 0.5045,
"step": 4655
},
{
"epoch": 2.422037422037422,
"grad_norm": 0.5584867361238677,
"learning_rate": 2.1779154255195576e-06,
"loss": 0.5018,
"step": 4660
},
{
"epoch": 2.4246361746361744,
"grad_norm": 0.566982522139209,
"learning_rate": 2.1591096289962077e-06,
"loss": 0.4911,
"step": 4665
},
{
"epoch": 2.4272349272349274,
"grad_norm": 0.560220035509712,
"learning_rate": 2.140375545972081e-06,
"loss": 0.5021,
"step": 4670
},
{
"epoch": 2.42983367983368,
"grad_norm": 0.5507532159687185,
"learning_rate": 2.121713347790808e-06,
"loss": 0.5036,
"step": 4675
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.5966472596819247,
"learning_rate": 2.1031232051385606e-06,
"loss": 0.4966,
"step": 4680
},
{
"epoch": 2.435031185031185,
"grad_norm": 0.5544285219883713,
"learning_rate": 2.0846052880424783e-06,
"loss": 0.501,
"step": 4685
},
{
"epoch": 2.4376299376299375,
"grad_norm": 0.5182057167941686,
"learning_rate": 2.0661597658691226e-06,
"loss": 0.4904,
"step": 4690
},
{
"epoch": 2.44022869022869,
"grad_norm": 0.5694712994846337,
"learning_rate": 2.047786807322927e-06,
"loss": 0.4875,
"step": 4695
},
{
"epoch": 2.442827442827443,
"grad_norm": 0.5644710268706207,
"learning_rate": 2.029486580444644e-06,
"loss": 0.4919,
"step": 4700
},
{
"epoch": 2.4454261954261955,
"grad_norm": 0.5709123415197537,
"learning_rate": 2.0112592526098173e-06,
"loss": 0.5087,
"step": 4705
},
{
"epoch": 2.448024948024948,
"grad_norm": 0.5535461175978135,
"learning_rate": 1.993104990527257e-06,
"loss": 0.4921,
"step": 4710
},
{
"epoch": 2.4506237006237006,
"grad_norm": 0.6301006821974645,
"learning_rate": 1.975023960237499e-06,
"loss": 0.4885,
"step": 4715
},
{
"epoch": 2.453222453222453,
"grad_norm": 0.5494288053608467,
"learning_rate": 1.957016327111294e-06,
"loss": 0.4906,
"step": 4720
},
{
"epoch": 2.4558212058212057,
"grad_norm": 0.5493564158683376,
"learning_rate": 1.9390822558481014e-06,
"loss": 0.4955,
"step": 4725
},
{
"epoch": 2.4584199584199586,
"grad_norm": 0.6043307426388902,
"learning_rate": 1.921221910474579e-06,
"loss": 0.5007,
"step": 4730
},
{
"epoch": 2.461018711018711,
"grad_norm": 0.5909390472872661,
"learning_rate": 1.9034354543430677e-06,
"loss": 0.5009,
"step": 4735
},
{
"epoch": 2.4636174636174637,
"grad_norm": 0.5601290888435961,
"learning_rate": 1.885723050130127e-06,
"loss": 0.4869,
"step": 4740
},
{
"epoch": 2.4662162162162162,
"grad_norm": 0.5671638427007798,
"learning_rate": 1.8680848598350165e-06,
"loss": 0.5002,
"step": 4745
},
{
"epoch": 2.4688149688149688,
"grad_norm": 0.5597631082866084,
"learning_rate": 1.8505210447782418e-06,
"loss": 0.5092,
"step": 4750
},
{
"epoch": 2.4714137214137213,
"grad_norm": 0.5611497450799863,
"learning_rate": 1.833031765600054e-06,
"loss": 0.5008,
"step": 4755
},
{
"epoch": 2.474012474012474,
"grad_norm": 0.5601559085266762,
"learning_rate": 1.8156171822589963e-06,
"loss": 0.4887,
"step": 4760
},
{
"epoch": 2.476611226611227,
"grad_norm": 0.555263493680061,
"learning_rate": 1.7982774540304404e-06,
"loss": 0.5112,
"step": 4765
},
{
"epoch": 2.4792099792099793,
"grad_norm": 0.5663743347641695,
"learning_rate": 1.781012739505127e-06,
"loss": 0.4907,
"step": 4770
},
{
"epoch": 2.481808731808732,
"grad_norm": 0.6155955922535356,
"learning_rate": 1.7638231965877039e-06,
"loss": 0.4836,
"step": 4775
},
{
"epoch": 2.4844074844074844,
"grad_norm": 0.5902555495646782,
"learning_rate": 1.7467089824953077e-06,
"loss": 0.5047,
"step": 4780
},
{
"epoch": 2.487006237006237,
"grad_norm": 0.5720398120641105,
"learning_rate": 1.7296702537560994e-06,
"loss": 0.5094,
"step": 4785
},
{
"epoch": 2.4896049896049894,
"grad_norm": 0.5593330846808308,
"learning_rate": 1.7127071662078455e-06,
"loss": 0.5121,
"step": 4790
},
{
"epoch": 2.492203742203742,
"grad_norm": 0.5807674813382018,
"learning_rate": 1.6958198749964983e-06,
"loss": 0.4888,
"step": 4795
},
{
"epoch": 2.494802494802495,
"grad_norm": 0.5712031491060828,
"learning_rate": 1.679008534574761e-06,
"loss": 0.485,
"step": 4800
},
{
"epoch": 2.4974012474012475,
"grad_norm": 0.5616832705475885,
"learning_rate": 1.6622732987006884e-06,
"loss": 0.5019,
"step": 4805
},
{
"epoch": 2.5,
"grad_norm": 0.5536996546706574,
"learning_rate": 1.6456143204362807e-06,
"loss": 0.4933,
"step": 4810
},
{
"epoch": 2.5025987525987525,
"grad_norm": 0.5707614937226522,
"learning_rate": 1.6290317521460697e-06,
"loss": 0.4828,
"step": 4815
},
{
"epoch": 2.505197505197505,
"grad_norm": 0.5927994991308208,
"learning_rate": 1.6125257454957365e-06,
"loss": 0.4861,
"step": 4820
},
{
"epoch": 2.507796257796258,
"grad_norm": 0.5852679815841081,
"learning_rate": 1.5960964514507316e-06,
"loss": 0.4944,
"step": 4825
},
{
"epoch": 2.51039501039501,
"grad_norm": 0.5782206531686512,
"learning_rate": 1.5797440202748748e-06,
"loss": 0.4897,
"step": 4830
},
{
"epoch": 2.512993762993763,
"grad_norm": 0.5749262146519877,
"learning_rate": 1.5634686015289925e-06,
"loss": 0.5008,
"step": 4835
},
{
"epoch": 2.5155925155925156,
"grad_norm": 0.5902984761192304,
"learning_rate": 1.5472703440695524e-06,
"loss": 0.4997,
"step": 4840
},
{
"epoch": 2.518191268191268,
"grad_norm": 0.5725171354203544,
"learning_rate": 1.5311493960472978e-06,
"loss": 0.4913,
"step": 4845
},
{
"epoch": 2.5207900207900207,
"grad_norm": 0.5489936561056176,
"learning_rate": 1.5151059049058913e-06,
"loss": 0.4965,
"step": 4850
},
{
"epoch": 2.523388773388773,
"grad_norm": 0.571188356733997,
"learning_rate": 1.499140017380566e-06,
"loss": 0.4955,
"step": 4855
},
{
"epoch": 2.525987525987526,
"grad_norm": 0.543934249979962,
"learning_rate": 1.4832518794967853e-06,
"loss": 0.498,
"step": 4860
},
{
"epoch": 2.5285862785862787,
"grad_norm": 0.5779586017866482,
"learning_rate": 1.4674416365689137e-06,
"loss": 0.5079,
"step": 4865
},
{
"epoch": 2.5311850311850312,
"grad_norm": 0.573790412674796,
"learning_rate": 1.4517094331988734e-06,
"loss": 0.5071,
"step": 4870
},
{
"epoch": 2.5337837837837838,
"grad_norm": 0.5834488347165243,
"learning_rate": 1.4360554132748305e-06,
"loss": 0.493,
"step": 4875
},
{
"epoch": 2.5363825363825363,
"grad_norm": 0.5521193528499587,
"learning_rate": 1.4204797199698839e-06,
"loss": 0.4893,
"step": 4880
},
{
"epoch": 2.538981288981289,
"grad_norm": 0.5837955107251298,
"learning_rate": 1.4049824957407464e-06,
"loss": 0.4998,
"step": 4885
},
{
"epoch": 2.5415800415800414,
"grad_norm": 0.5654955515661542,
"learning_rate": 1.3895638823264447e-06,
"loss": 0.4913,
"step": 4890
},
{
"epoch": 2.5441787941787943,
"grad_norm": 0.5743033149419415,
"learning_rate": 1.374224020747027e-06,
"loss": 0.5056,
"step": 4895
},
{
"epoch": 2.546777546777547,
"grad_norm": 0.5855881014618302,
"learning_rate": 1.3589630513022656e-06,
"loss": 0.5028,
"step": 4900
},
{
"epoch": 2.5493762993762994,
"grad_norm": 0.6031010192364838,
"learning_rate": 1.3437811135703792e-06,
"loss": 0.4964,
"step": 4905
},
{
"epoch": 2.551975051975052,
"grad_norm": 0.5409820031001269,
"learning_rate": 1.328678346406761e-06,
"loss": 0.4946,
"step": 4910
},
{
"epoch": 2.5545738045738045,
"grad_norm": 0.5667214248558752,
"learning_rate": 1.3136548879426926e-06,
"loss": 0.492,
"step": 4915
},
{
"epoch": 2.5571725571725574,
"grad_norm": 0.5478082998559753,
"learning_rate": 1.2987108755840994e-06,
"loss": 0.4949,
"step": 4920
},
{
"epoch": 2.5597713097713095,
"grad_norm": 0.5748275704846928,
"learning_rate": 1.2838464460102862e-06,
"loss": 0.4969,
"step": 4925
},
{
"epoch": 2.5623700623700625,
"grad_norm": 0.5561105303734099,
"learning_rate": 1.2690617351726798e-06,
"loss": 0.4967,
"step": 4930
},
{
"epoch": 2.564968814968815,
"grad_norm": 0.5847660828596739,
"learning_rate": 1.2543568782935933e-06,
"loss": 0.4893,
"step": 4935
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.5797822737989639,
"learning_rate": 1.2397320098649957e-06,
"loss": 0.5002,
"step": 4940
},
{
"epoch": 2.57016632016632,
"grad_norm": 0.5696211912101424,
"learning_rate": 1.225187263647265e-06,
"loss": 0.5056,
"step": 4945
},
{
"epoch": 2.5727650727650726,
"grad_norm": 0.6105509252737591,
"learning_rate": 1.210722772667977e-06,
"loss": 0.4786,
"step": 4950
},
{
"epoch": 2.5753638253638256,
"grad_norm": 0.5710521831184937,
"learning_rate": 1.196338669220689e-06,
"loss": 0.4895,
"step": 4955
},
{
"epoch": 2.577962577962578,
"grad_norm": 0.5601653499624455,
"learning_rate": 1.182035084863724e-06,
"loss": 0.5016,
"step": 4960
},
{
"epoch": 2.5805613305613306,
"grad_norm": 0.584343919584128,
"learning_rate": 1.167812150418972e-06,
"loss": 0.5159,
"step": 4965
},
{
"epoch": 2.583160083160083,
"grad_norm": 0.6129296779221889,
"learning_rate": 1.1536699959706898e-06,
"loss": 0.5055,
"step": 4970
},
{
"epoch": 2.5857588357588357,
"grad_norm": 0.5844416957330778,
"learning_rate": 1.1396087508643106e-06,
"loss": 0.504,
"step": 4975
},
{
"epoch": 2.5883575883575882,
"grad_norm": 0.5750509882184978,
"learning_rate": 1.1256285437052684e-06,
"loss": 0.4925,
"step": 4980
},
{
"epoch": 2.5909563409563408,
"grad_norm": 0.5486207493135079,
"learning_rate": 1.1117295023578134e-06,
"loss": 0.5079,
"step": 4985
},
{
"epoch": 2.5935550935550937,
"grad_norm": 0.5751013655385914,
"learning_rate": 1.0979117539438444e-06,
"loss": 0.4925,
"step": 4990
},
{
"epoch": 2.5961538461538463,
"grad_norm": 0.5940615821819871,
"learning_rate": 1.0841754248417535e-06,
"loss": 0.5001,
"step": 4995
},
{
"epoch": 2.598752598752599,
"grad_norm": 0.544341637873671,
"learning_rate": 1.0705206406852607e-06,
"loss": 0.5003,
"step": 5000
},
{
"epoch": 2.6013513513513513,
"grad_norm": 0.5688429085624325,
"learning_rate": 1.0569475263622652e-06,
"loss": 0.492,
"step": 5005
},
{
"epoch": 2.603950103950104,
"grad_norm": 0.5898916948570275,
"learning_rate": 1.0434562060137154e-06,
"loss": 0.494,
"step": 5010
},
{
"epoch": 2.606548856548857,
"grad_norm": 0.5415866533855809,
"learning_rate": 1.030046803032455e-06,
"loss": 0.4904,
"step": 5015
},
{
"epoch": 2.609147609147609,
"grad_norm": 0.5719708739269925,
"learning_rate": 1.0167194400621072e-06,
"loss": 0.489,
"step": 5020
},
{
"epoch": 2.611746361746362,
"grad_norm": 0.5958211082907041,
"learning_rate": 1.003474238995954e-06,
"loss": 0.4957,
"step": 5025
},
{
"epoch": 2.6143451143451144,
"grad_norm": 0.5541558159414748,
"learning_rate": 9.903113209758098e-07,
"loss": 0.4993,
"step": 5030
},
{
"epoch": 2.616943866943867,
"grad_norm": 0.5872960411024288,
"learning_rate": 9.772308063909263e-07,
"loss": 0.5105,
"step": 5035
},
{
"epoch": 2.6195426195426195,
"grad_norm": 0.5605189291423912,
"learning_rate": 9.642328148768865e-07,
"loss": 0.4963,
"step": 5040
},
{
"epoch": 2.622141372141372,
"grad_norm": 0.5607585059207263,
"learning_rate": 9.513174653145052e-07,
"loss": 0.5028,
"step": 5045
},
{
"epoch": 2.624740124740125,
"grad_norm": 0.5534539551699679,
"learning_rate": 9.384848758287469e-07,
"loss": 0.4894,
"step": 5050
},
{
"epoch": 2.6273388773388775,
"grad_norm": 0.5668591204471706,
"learning_rate": 9.25735163787651e-07,
"loss": 0.5004,
"step": 5055
},
{
"epoch": 2.62993762993763,
"grad_norm": 0.5428488027300683,
"learning_rate": 9.13068445801244e-07,
"loss": 0.5028,
"step": 5060
},
{
"epoch": 2.6325363825363826,
"grad_norm": 0.5730809692935364,
"learning_rate": 9.004848377204878e-07,
"loss": 0.4961,
"step": 5065
},
{
"epoch": 2.635135135135135,
"grad_norm": 0.5287247837670602,
"learning_rate": 8.879844546362093e-07,
"loss": 0.499,
"step": 5070
},
{
"epoch": 2.6377338877338876,
"grad_norm": 0.5730710423541429,
"learning_rate": 8.755674108780532e-07,
"loss": 0.4964,
"step": 5075
},
{
"epoch": 2.64033264033264,
"grad_norm": 0.5506264010952827,
"learning_rate": 8.632338200134382e-07,
"loss": 0.4936,
"step": 5080
},
{
"epoch": 2.642931392931393,
"grad_norm": 0.519210055930055,
"learning_rate": 8.509837948465094e-07,
"loss": 0.49,
"step": 5085
},
{
"epoch": 2.6455301455301456,
"grad_norm": 0.5809039207230658,
"learning_rate": 8.388174474171163e-07,
"loss": 0.5033,
"step": 5090
},
{
"epoch": 2.648128898128898,
"grad_norm": 0.561018096449011,
"learning_rate": 8.267348889997839e-07,
"loss": 0.5051,
"step": 5095
},
{
"epoch": 2.6507276507276507,
"grad_norm": 0.5422624927838648,
"learning_rate": 8.14736230102694e-07,
"loss": 0.4864,
"step": 5100
},
{
"epoch": 2.6533264033264032,
"grad_norm": 0.549824075528394,
"learning_rate": 8.028215804666761e-07,
"loss": 0.5027,
"step": 5105
},
{
"epoch": 2.6559251559251558,
"grad_norm": 0.6073556177013598,
"learning_rate": 7.909910490642025e-07,
"loss": 0.4981,
"step": 5110
},
{
"epoch": 2.6585239085239083,
"grad_norm": 0.5812550130344551,
"learning_rate": 7.792447440983985e-07,
"loss": 0.504,
"step": 5115
},
{
"epoch": 2.6611226611226613,
"grad_norm": 0.5688133990130678,
"learning_rate": 7.675827730020358e-07,
"loss": 0.5004,
"step": 5120
},
{
"epoch": 2.663721413721414,
"grad_norm": 0.5617035595950866,
"learning_rate": 7.560052424365716e-07,
"loss": 0.4923,
"step": 5125
},
{
"epoch": 2.6663201663201663,
"grad_norm": 0.5835381005107588,
"learning_rate": 7.445122582911546e-07,
"loss": 0.4989,
"step": 5130
},
{
"epoch": 2.668918918918919,
"grad_norm": 0.5681826093882452,
"learning_rate": 7.331039256816664e-07,
"loss": 0.5001,
"step": 5135
},
{
"epoch": 2.6715176715176714,
"grad_norm": 0.5416547579730493,
"learning_rate": 7.217803489497621e-07,
"loss": 0.4915,
"step": 5140
},
{
"epoch": 2.6741164241164244,
"grad_norm": 0.5708948503374369,
"learning_rate": 7.10541631661904e-07,
"loss": 0.506,
"step": 5145
},
{
"epoch": 2.6767151767151764,
"grad_norm": 0.5825889025850369,
"learning_rate": 6.993878766084295e-07,
"loss": 0.4978,
"step": 5150
},
{
"epoch": 2.6793139293139294,
"grad_norm": 0.5658766831235557,
"learning_rate": 6.883191858026006e-07,
"loss": 0.5002,
"step": 5155
},
{
"epoch": 2.681912681912682,
"grad_norm": 0.5560529305298989,
"learning_rate": 6.773356604796744e-07,
"loss": 0.4975,
"step": 5160
},
{
"epoch": 2.6845114345114345,
"grad_norm": 0.5708450386103079,
"learning_rate": 6.664374010959739e-07,
"loss": 0.5089,
"step": 5165
},
{
"epoch": 2.687110187110187,
"grad_norm": 0.5562285971352838,
"learning_rate": 6.556245073279777e-07,
"loss": 0.5075,
"step": 5170
},
{
"epoch": 2.6897089397089395,
"grad_norm": 0.5721605688382857,
"learning_rate": 6.448970780713948e-07,
"loss": 0.4876,
"step": 5175
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.567860262795361,
"learning_rate": 6.342552114402789e-07,
"loss": 0.4968,
"step": 5180
},
{
"epoch": 2.694906444906445,
"grad_norm": 0.5512124172540173,
"learning_rate": 6.236990047661074e-07,
"loss": 0.4971,
"step": 5185
},
{
"epoch": 2.6975051975051976,
"grad_norm": 0.5567955011645962,
"learning_rate": 6.132285545969141e-07,
"loss": 0.4893,
"step": 5190
},
{
"epoch": 2.70010395010395,
"grad_norm": 0.5658501671925406,
"learning_rate": 6.028439566963929e-07,
"loss": 0.4899,
"step": 5195
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.5444634054315433,
"learning_rate": 5.925453060430219e-07,
"loss": 0.4878,
"step": 5200
},
{
"epoch": 2.705301455301455,
"grad_norm": 0.550663548661878,
"learning_rate": 5.823326968292009e-07,
"loss": 0.5009,
"step": 5205
},
{
"epoch": 2.7079002079002077,
"grad_norm": 0.6060107034007801,
"learning_rate": 5.722062224603886e-07,
"loss": 0.4946,
"step": 5210
},
{
"epoch": 2.7104989604989607,
"grad_norm": 0.58216821945967,
"learning_rate": 5.621659755542408e-07,
"loss": 0.5057,
"step": 5215
},
{
"epoch": 2.713097713097713,
"grad_norm": 0.5416674185051638,
"learning_rate": 5.522120479397731e-07,
"loss": 0.4965,
"step": 5220
},
{
"epoch": 2.7156964656964657,
"grad_norm": 0.5761995130950316,
"learning_rate": 5.423445306565168e-07,
"loss": 0.5038,
"step": 5225
},
{
"epoch": 2.7182952182952183,
"grad_norm": 0.5635042371421582,
"learning_rate": 5.325635139536867e-07,
"loss": 0.4884,
"step": 5230
},
{
"epoch": 2.720893970893971,
"grad_norm": 0.5743033588993577,
"learning_rate": 5.228690872893527e-07,
"loss": 0.4934,
"step": 5235
},
{
"epoch": 2.7234927234927238,
"grad_norm": 0.5431291593888027,
"learning_rate": 5.132613393296293e-07,
"loss": 0.4921,
"step": 5240
},
{
"epoch": 2.726091476091476,
"grad_norm": 0.5702390465003064,
"learning_rate": 5.037403579478551e-07,
"loss": 0.5067,
"step": 5245
},
{
"epoch": 2.728690228690229,
"grad_norm": 0.5864949506182338,
"learning_rate": 4.943062302237922e-07,
"loss": 0.5047,
"step": 5250
},
{
"epoch": 2.7312889812889813,
"grad_norm": 0.5856655357457804,
"learning_rate": 4.849590424428386e-07,
"loss": 0.498,
"step": 5255
},
{
"epoch": 2.733887733887734,
"grad_norm": 0.5592585991123705,
"learning_rate": 4.7569888009522336e-07,
"loss": 0.5062,
"step": 5260
},
{
"epoch": 2.7364864864864864,
"grad_norm": 0.5598975614142522,
"learning_rate": 4.665258278752383e-07,
"loss": 0.4922,
"step": 5265
},
{
"epoch": 2.739085239085239,
"grad_norm": 0.5672657648159654,
"learning_rate": 4.574399696804588e-07,
"loss": 0.5032,
"step": 5270
},
{
"epoch": 2.741683991683992,
"grad_norm": 0.5635739403935113,
"learning_rate": 4.4844138861096954e-07,
"loss": 0.4914,
"step": 5275
},
{
"epoch": 2.7442827442827444,
"grad_norm": 0.5707341919153839,
"learning_rate": 4.3953016696861805e-07,
"loss": 0.4955,
"step": 5280
},
{
"epoch": 2.746881496881497,
"grad_norm": 0.593682622033041,
"learning_rate": 4.3070638625624884e-07,
"loss": 0.504,
"step": 5285
},
{
"epoch": 2.7494802494802495,
"grad_norm": 0.5308025960734446,
"learning_rate": 4.2197012717696604e-07,
"loss": 0.4898,
"step": 5290
},
{
"epoch": 2.752079002079002,
"grad_norm": 0.562474477928597,
"learning_rate": 4.133214696333943e-07,
"loss": 0.4919,
"step": 5295
},
{
"epoch": 2.7546777546777546,
"grad_norm": 0.5626677048136434,
"learning_rate": 4.047604927269433e-07,
"loss": 0.5041,
"step": 5300
},
{
"epoch": 2.757276507276507,
"grad_norm": 0.5534559012860586,
"learning_rate": 3.9628727475709003e-07,
"loss": 0.5018,
"step": 5305
},
{
"epoch": 2.75987525987526,
"grad_norm": 0.5678002530448841,
"learning_rate": 3.879018932206624e-07,
"loss": 0.4795,
"step": 5310
},
{
"epoch": 2.7624740124740126,
"grad_norm": 0.5725188711773384,
"learning_rate": 3.796044248111219e-07,
"loss": 0.4825,
"step": 5315
},
{
"epoch": 2.765072765072765,
"grad_norm": 0.5449981795766418,
"learning_rate": 3.7139494541787225e-07,
"loss": 0.4966,
"step": 5320
},
{
"epoch": 2.7676715176715176,
"grad_norm": 0.5793024671746052,
"learning_rate": 3.632735301255652e-07,
"loss": 0.499,
"step": 5325
},
{
"epoch": 2.77027027027027,
"grad_norm": 0.5463699816730897,
"learning_rate": 3.552402532134014e-07,
"loss": 0.4971,
"step": 5330
},
{
"epoch": 2.7728690228690227,
"grad_norm": 0.569435124360503,
"learning_rate": 3.472951881544695e-07,
"loss": 0.4965,
"step": 5335
},
{
"epoch": 2.7754677754677752,
"grad_norm": 0.5426897702433433,
"learning_rate": 3.3943840761505695e-07,
"loss": 0.5109,
"step": 5340
},
{
"epoch": 2.778066528066528,
"grad_norm": 0.5583873172184759,
"learning_rate": 3.316699834539983e-07,
"loss": 0.5025,
"step": 5345
},
{
"epoch": 2.7806652806652807,
"grad_norm": 0.589354529655944,
"learning_rate": 3.239899867220064e-07,
"loss": 0.4998,
"step": 5350
},
{
"epoch": 2.7832640332640333,
"grad_norm": 0.5254097934455335,
"learning_rate": 3.163984876610371e-07,
"loss": 0.4949,
"step": 5355
},
{
"epoch": 2.785862785862786,
"grad_norm": 0.5536550483370661,
"learning_rate": 3.0889555570363216e-07,
"loss": 0.4917,
"step": 5360
},
{
"epoch": 2.7884615384615383,
"grad_norm": 0.5994721180940217,
"learning_rate": 3.0148125947229047e-07,
"loss": 0.495,
"step": 5365
},
{
"epoch": 2.7910602910602913,
"grad_norm": 0.5652212087251041,
"learning_rate": 2.9415566677884365e-07,
"loss": 0.5029,
"step": 5370
},
{
"epoch": 2.7936590436590434,
"grad_norm": 0.567605739930232,
"learning_rate": 2.869188446238336e-07,
"loss": 0.506,
"step": 5375
},
{
"epoch": 2.7962577962577964,
"grad_norm": 0.5569576384780233,
"learning_rate": 2.7977085919589253e-07,
"loss": 0.5003,
"step": 5380
},
{
"epoch": 2.798856548856549,
"grad_norm": 0.5412543330665912,
"learning_rate": 2.727117758711506e-07,
"loss": 0.4887,
"step": 5385
},
{
"epoch": 2.8014553014553014,
"grad_norm": 0.5376966982466084,
"learning_rate": 2.6574165921262605e-07,
"loss": 0.4888,
"step": 5390
},
{
"epoch": 2.804054054054054,
"grad_norm": 0.54053951299071,
"learning_rate": 2.588605729696447e-07,
"loss": 0.4919,
"step": 5395
},
{
"epoch": 2.8066528066528065,
"grad_norm": 0.5981753062988322,
"learning_rate": 2.5206858007724934e-07,
"loss": 0.4839,
"step": 5400
},
{
"epoch": 2.8092515592515594,
"grad_norm": 0.5725431316908658,
"learning_rate": 2.453657426556244e-07,
"loss": 0.5122,
"step": 5405
},
{
"epoch": 2.811850311850312,
"grad_norm": 0.5422874879244404,
"learning_rate": 2.387521220095357e-07,
"loss": 0.4891,
"step": 5410
},
{
"epoch": 2.8144490644490645,
"grad_norm": 0.5599975123926269,
"learning_rate": 2.3222777862776046e-07,
"loss": 0.5021,
"step": 5415
},
{
"epoch": 2.817047817047817,
"grad_norm": 0.5590054648939673,
"learning_rate": 2.2579277218253926e-07,
"loss": 0.4841,
"step": 5420
},
{
"epoch": 2.8196465696465696,
"grad_norm": 0.5504364503745305,
"learning_rate": 2.1944716152902834e-07,
"loss": 0.5002,
"step": 5425
},
{
"epoch": 2.822245322245322,
"grad_norm": 0.5797812317339487,
"learning_rate": 2.131910047047625e-07,
"loss": 0.486,
"step": 5430
},
{
"epoch": 2.8248440748440746,
"grad_norm": 0.5584561563327431,
"learning_rate": 2.070243589291221e-07,
"loss": 0.4879,
"step": 5435
},
{
"epoch": 2.8274428274428276,
"grad_norm": 0.5983591006728118,
"learning_rate": 2.0094728060281454e-07,
"loss": 0.4964,
"step": 5440
},
{
"epoch": 2.83004158004158,
"grad_norm": 0.5793345159708853,
"learning_rate": 1.9495982530735035e-07,
"loss": 0.4931,
"step": 5445
},
{
"epoch": 2.8326403326403327,
"grad_norm": 0.5521618537806441,
"learning_rate": 1.890620478045435e-07,
"loss": 0.4844,
"step": 5450
},
{
"epoch": 2.835239085239085,
"grad_norm": 0.5590622081741721,
"learning_rate": 1.832540020360063e-07,
"loss": 0.4941,
"step": 5455
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.5807487335701172,
"learning_rate": 1.7753574112265526e-07,
"loss": 0.4888,
"step": 5460
},
{
"epoch": 2.8404365904365907,
"grad_norm": 0.5896137562543345,
"learning_rate": 1.7190731736422606e-07,
"loss": 0.4983,
"step": 5465
},
{
"epoch": 2.8430353430353428,
"grad_norm": 0.5740425049161126,
"learning_rate": 1.6636878223879826e-07,
"loss": 0.4931,
"step": 5470
},
{
"epoch": 2.8456340956340958,
"grad_norm": 0.5440884316639071,
"learning_rate": 1.6092018640231688e-07,
"loss": 0.4831,
"step": 5475
},
{
"epoch": 2.8482328482328483,
"grad_norm": 0.5781760225759857,
"learning_rate": 1.5556157968813823e-07,
"loss": 0.4988,
"step": 5480
},
{
"epoch": 2.850831600831601,
"grad_norm": 0.5889448479356277,
"learning_rate": 1.5029301110656923e-07,
"loss": 0.4885,
"step": 5485
},
{
"epoch": 2.8534303534303533,
"grad_norm": 0.6111223279448279,
"learning_rate": 1.4511452884441778e-07,
"loss": 0.5014,
"step": 5490
},
{
"epoch": 2.856029106029106,
"grad_norm": 0.5497820384951762,
"learning_rate": 1.400261802645575e-07,
"loss": 0.4951,
"step": 5495
},
{
"epoch": 2.858627858627859,
"grad_norm": 0.560040257401693,
"learning_rate": 1.350280119054881e-07,
"loss": 0.4907,
"step": 5500
},
{
"epoch": 2.8612266112266114,
"grad_norm": 0.5832689907805786,
"learning_rate": 1.3012006948091237e-07,
"loss": 0.4919,
"step": 5505
},
{
"epoch": 2.863825363825364,
"grad_norm": 0.5550041536720619,
"learning_rate": 1.2530239787932108e-07,
"loss": 0.4841,
"step": 5510
},
{
"epoch": 2.8664241164241164,
"grad_norm": 0.5548285935264545,
"learning_rate": 1.2057504116357865e-07,
"loss": 0.4957,
"step": 5515
},
{
"epoch": 2.869022869022869,
"grad_norm": 0.5573963669652322,
"learning_rate": 1.1593804257052143e-07,
"loss": 0.5003,
"step": 5520
},
{
"epoch": 2.8716216216216215,
"grad_norm": 0.573683837072622,
"learning_rate": 1.1139144451056016e-07,
"loss": 0.4917,
"step": 5525
},
{
"epoch": 2.874220374220374,
"grad_norm": 0.5955060126211607,
"learning_rate": 1.0693528856729918e-07,
"loss": 0.5077,
"step": 5530
},
{
"epoch": 2.876819126819127,
"grad_norm": 0.5823077256056483,
"learning_rate": 1.025696154971445e-07,
"loss": 0.4879,
"step": 5535
},
{
"epoch": 2.8794178794178795,
"grad_norm": 0.578110542140886,
"learning_rate": 9.829446522894193e-08,
"loss": 0.5007,
"step": 5540
},
{
"epoch": 2.882016632016632,
"grad_norm": 0.5745290388769638,
"learning_rate": 9.410987686360618e-08,
"loss": 0.4846,
"step": 5545
},
{
"epoch": 2.8846153846153846,
"grad_norm": 0.5645029409864777,
"learning_rate": 9.001588867376343e-08,
"loss": 0.4875,
"step": 5550
},
{
"epoch": 2.887214137214137,
"grad_norm": 0.5579791649018835,
"learning_rate": 8.601253810340493e-08,
"loss": 0.498,
"step": 5555
},
{
"epoch": 2.88981288981289,
"grad_norm": 0.5352826063441829,
"learning_rate": 8.209986176753947e-08,
"loss": 0.4929,
"step": 5560
},
{
"epoch": 2.892411642411642,
"grad_norm": 0.5406028586508593,
"learning_rate": 7.827789545186149e-08,
"loss": 0.493,
"step": 5565
},
{
"epoch": 2.895010395010395,
"grad_norm": 0.5735179042030664,
"learning_rate": 7.454667411242677e-08,
"loss": 0.4974,
"step": 5570
},
{
"epoch": 2.8976091476091477,
"grad_norm": 0.5884667672124062,
"learning_rate": 7.090623187532286e-08,
"loss": 0.4979,
"step": 5575
},
{
"epoch": 2.9002079002079,
"grad_norm": 0.5483035016900611,
"learning_rate": 6.735660203636918e-08,
"loss": 0.4905,
"step": 5580
},
{
"epoch": 2.9028066528066527,
"grad_norm": 0.5528971829398494,
"learning_rate": 6.389781706080289e-08,
"loss": 0.5122,
"step": 5585
},
{
"epoch": 2.9054054054054053,
"grad_norm": 0.6057996286720458,
"learning_rate": 6.052990858298801e-08,
"loss": 0.5028,
"step": 5590
},
{
"epoch": 2.9080041580041582,
"grad_norm": 0.5791623231565692,
"learning_rate": 5.7252907406123436e-08,
"loss": 0.4982,
"step": 5595
},
{
"epoch": 2.9106029106029108,
"grad_norm": 0.5627119775940129,
"learning_rate": 5.406684350195979e-08,
"loss": 0.4964,
"step": 5600
},
{
"epoch": 2.9132016632016633,
"grad_norm": 0.5559095452523849,
"learning_rate": 5.0971746010528566e-08,
"loss": 0.5063,
"step": 5605
},
{
"epoch": 2.915800415800416,
"grad_norm": 0.5535048888740743,
"learning_rate": 4.7967643239875686e-08,
"loss": 0.501,
"step": 5610
},
{
"epoch": 2.9183991683991684,
"grad_norm": 0.5462617861557779,
"learning_rate": 4.505456266579833e-08,
"loss": 0.5031,
"step": 5615
},
{
"epoch": 2.920997920997921,
"grad_norm": 0.5384884238791128,
"learning_rate": 4.22325309315963e-08,
"loss": 0.5019,
"step": 5620
},
{
"epoch": 2.9235966735966734,
"grad_norm": 0.5773761719166159,
"learning_rate": 3.950157384783104e-08,
"loss": 0.4939,
"step": 5625
},
{
"epoch": 2.9261954261954264,
"grad_norm": 0.5595211401825231,
"learning_rate": 3.68617163920848e-08,
"loss": 0.5007,
"step": 5630
},
{
"epoch": 2.928794178794179,
"grad_norm": 0.5658405364960404,
"learning_rate": 3.4312982708734065e-08,
"loss": 0.4806,
"step": 5635
},
{
"epoch": 2.9313929313929314,
"grad_norm": 0.5503957964422638,
"learning_rate": 3.1855396108730897e-08,
"loss": 0.5014,
"step": 5640
},
{
"epoch": 2.933991683991684,
"grad_norm": 0.5453959548244628,
"learning_rate": 2.9488979069387523e-08,
"loss": 0.4894,
"step": 5645
},
{
"epoch": 2.9365904365904365,
"grad_norm": 0.5693202599680172,
"learning_rate": 2.721375323416875e-08,
"loss": 0.4966,
"step": 5650
},
{
"epoch": 2.939189189189189,
"grad_norm": 0.5709168022581379,
"learning_rate": 2.5029739412497643e-08,
"loss": 0.4887,
"step": 5655
},
{
"epoch": 2.9417879417879416,
"grad_norm": 0.5402927981587441,
"learning_rate": 2.293695757956571e-08,
"loss": 0.4968,
"step": 5660
},
{
"epoch": 2.9443866943866945,
"grad_norm": 0.629131836957027,
"learning_rate": 2.0935426876144138e-08,
"loss": 0.4891,
"step": 5665
},
{
"epoch": 2.946985446985447,
"grad_norm": 0.5667833955309428,
"learning_rate": 1.9025165608418382e-08,
"loss": 0.4975,
"step": 5670
},
{
"epoch": 2.9495841995841996,
"grad_norm": 0.5618681057012622,
"learning_rate": 1.7206191247810533e-08,
"loss": 0.4949,
"step": 5675
},
{
"epoch": 2.952182952182952,
"grad_norm": 0.5467732554533196,
"learning_rate": 1.5478520430826095e-08,
"loss": 0.4985,
"step": 5680
},
{
"epoch": 2.9547817047817047,
"grad_norm": 0.566899525327703,
"learning_rate": 1.3842168958900782e-08,
"loss": 0.4978,
"step": 5685
},
{
"epoch": 2.9573804573804576,
"grad_norm": 0.548674729535616,
"learning_rate": 1.229715179825397e-08,
"loss": 0.5092,
"step": 5690
},
{
"epoch": 2.9599792099792097,
"grad_norm": 0.5855706553975555,
"learning_rate": 1.0843483079755468e-08,
"loss": 0.5036,
"step": 5695
},
{
"epoch": 2.9625779625779627,
"grad_norm": 0.574837155061116,
"learning_rate": 9.481176098788958e-09,
"loss": 0.5036,
"step": 5700
},
{
"epoch": 2.965176715176715,
"grad_norm": 0.5658141647440329,
"learning_rate": 8.210243315140976e-09,
"loss": 0.4972,
"step": 5705
},
{
"epoch": 2.9677754677754677,
"grad_norm": 0.5791629723335482,
"learning_rate": 7.030696352878786e-09,
"loss": 0.4942,
"step": 5710
},
{
"epoch": 2.9703742203742203,
"grad_norm": 0.5707304439471226,
"learning_rate": 5.942546000244909e-09,
"loss": 0.4946,
"step": 5715
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.5873811028122294,
"learning_rate": 4.945802209562755e-09,
"loss": 0.4899,
"step": 5720
},
{
"epoch": 2.975571725571726,
"grad_norm": 0.5626457291531141,
"learning_rate": 4.0404740971433655e-09,
"loss": 0.4837,
"step": 5725
},
{
"epoch": 2.9781704781704783,
"grad_norm": 0.5482498449859082,
"learning_rate": 3.226569943197699e-09,
"loss": 0.4958,
"step": 5730
},
{
"epoch": 2.980769230769231,
"grad_norm": 0.57584453111696,
"learning_rate": 2.5040971917689172e-09,
"loss": 0.5065,
"step": 5735
},
{
"epoch": 2.9833679833679834,
"grad_norm": 0.6193525019087672,
"learning_rate": 1.873062450659102e-09,
"loss": 0.4947,
"step": 5740
},
{
"epoch": 2.985966735966736,
"grad_norm": 0.5534322935945964,
"learning_rate": 1.3334714913681989e-09,
"loss": 0.4968,
"step": 5745
},
{
"epoch": 2.9885654885654884,
"grad_norm": 0.5698273659048839,
"learning_rate": 8.853292490462739e-10,
"loss": 0.4965,
"step": 5750
},
{
"epoch": 2.991164241164241,
"grad_norm": 0.6072011011211793,
"learning_rate": 5.286398224413347e-10,
"loss": 0.4982,
"step": 5755
},
{
"epoch": 2.993762993762994,
"grad_norm": 0.5483495272455329,
"learning_rate": 2.6340647386935426e-10,
"loss": 0.4905,
"step": 5760
},
{
"epoch": 2.9963617463617465,
"grad_norm": 0.5468214431100177,
"learning_rate": 8.963162917763335e-11,
"loss": 0.4943,
"step": 5765
},
{
"epoch": 2.998960498960499,
"grad_norm": 0.554348853048081,
"learning_rate": 7.31687772592693e-12,
"loss": 0.4984,
"step": 5770
},
{
"epoch": 3.0,
"eval_loss": 0.8089174032211304,
"eval_runtime": 106.5642,
"eval_samples_per_second": 77.043,
"eval_steps_per_second": 1.211,
"step": 5772
},
{
"epoch": 3.0,
"step": 5772,
"total_flos": 1208539372584960.0,
"train_loss": 0.6196737293559317,
"train_runtime": 16925.3973,
"train_samples_per_second": 21.825,
"train_steps_per_second": 0.341
}
],
"logging_steps": 5,
"max_steps": 5772,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1208539372584960.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}