{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5772, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005197505197505198, "grad_norm": 37.059080589232245, "learning_rate": 3.4602076124567476e-08, "loss": 2.0466, "step": 1 }, { "epoch": 0.002598752598752599, "grad_norm": 37.255614130391415, "learning_rate": 1.730103806228374e-07, "loss": 2.0444, "step": 5 }, { "epoch": 0.005197505197505198, "grad_norm": 38.706692454199526, "learning_rate": 3.460207612456748e-07, "loss": 2.0231, "step": 10 }, { "epoch": 0.007796257796257797, "grad_norm": 38.51739451223501, "learning_rate": 5.190311418685121e-07, "loss": 1.9458, "step": 15 }, { "epoch": 0.010395010395010396, "grad_norm": 10.810954056905189, "learning_rate": 6.920415224913496e-07, "loss": 1.842, "step": 20 }, { "epoch": 0.012993762993762994, "grad_norm": 5.592591424491592, "learning_rate": 8.650519031141868e-07, "loss": 1.736, "step": 25 }, { "epoch": 0.015592515592515593, "grad_norm": 4.235861784422552, "learning_rate": 1.0380622837370243e-06, "loss": 1.6427, "step": 30 }, { "epoch": 0.018191268191268192, "grad_norm": 3.005957983709911, "learning_rate": 1.2110726643598616e-06, "loss": 1.4661, "step": 35 }, { "epoch": 0.02079002079002079, "grad_norm": 2.1399033031159416, "learning_rate": 1.3840830449826992e-06, "loss": 1.3665, "step": 40 }, { "epoch": 0.02338877338877339, "grad_norm": 2.311859726684216, "learning_rate": 1.5570934256055365e-06, "loss": 1.2455, "step": 45 }, { "epoch": 0.02598752598752599, "grad_norm": 2.2333969372651588, "learning_rate": 1.7301038062283736e-06, "loss": 1.1655, "step": 50 }, { "epoch": 0.028586278586278588, "grad_norm": 1.4673684609803286, "learning_rate": 1.9031141868512112e-06, "loss": 1.0897, "step": 55 }, { "epoch": 0.031185031185031187, "grad_norm": 1.219673178403078, "learning_rate": 2.0761245674740485e-06, "loss": 1.0397, "step": 60 }, { "epoch": 0.033783783783783786, "grad_norm": 1.177265012600064, "learning_rate": 2.249134948096886e-06, "loss": 0.9918, "step": 65 }, { "epoch": 0.036382536382536385, "grad_norm": 1.1401627512076926, "learning_rate": 2.4221453287197232e-06, "loss": 0.9519, "step": 70 }, { "epoch": 0.03898128898128898, "grad_norm": 1.104171712944849, "learning_rate": 2.5951557093425604e-06, "loss": 0.9419, "step": 75 }, { "epoch": 0.04158004158004158, "grad_norm": 1.153098727216341, "learning_rate": 2.7681660899653983e-06, "loss": 0.9108, "step": 80 }, { "epoch": 0.04417879417879418, "grad_norm": 1.1902270145957274, "learning_rate": 2.9411764705882355e-06, "loss": 0.8827, "step": 85 }, { "epoch": 0.04677754677754678, "grad_norm": 1.246541694659634, "learning_rate": 3.114186851211073e-06, "loss": 0.8805, "step": 90 }, { "epoch": 0.04937629937629938, "grad_norm": 1.3692651531313507, "learning_rate": 3.28719723183391e-06, "loss": 0.872, "step": 95 }, { "epoch": 0.05197505197505198, "grad_norm": 1.2906179501144006, "learning_rate": 3.4602076124567473e-06, "loss": 0.8603, "step": 100 }, { "epoch": 0.05457380457380458, "grad_norm": 1.1975871593644642, "learning_rate": 3.6332179930795853e-06, "loss": 0.8401, "step": 105 }, { "epoch": 0.057172557172557176, "grad_norm": 1.0702701275027335, "learning_rate": 3.8062283737024224e-06, "loss": 0.8599, "step": 110 }, { "epoch": 0.059771309771309775, "grad_norm": 1.2062361616415083, "learning_rate": 3.9792387543252595e-06, "loss": 0.8437, "step": 115 }, { "epoch": 0.062370062370062374, "grad_norm": 1.0287364509894605, "learning_rate": 4.152249134948097e-06, "loss": 0.822, "step": 120 }, { "epoch": 0.06496881496881497, "grad_norm": 1.3847381271884296, "learning_rate": 4.325259515570935e-06, "loss": 0.8193, "step": 125 }, { "epoch": 0.06756756756756757, "grad_norm": 1.2610205489076338, "learning_rate": 4.498269896193772e-06, "loss": 0.8202, "step": 130 }, { "epoch": 0.07016632016632017, "grad_norm": 1.2296632878962366, "learning_rate": 4.67128027681661e-06, "loss": 0.8231, "step": 135 }, { "epoch": 0.07276507276507277, "grad_norm": 1.2968286734442396, "learning_rate": 4.8442906574394464e-06, "loss": 0.8196, "step": 140 }, { "epoch": 0.07536382536382537, "grad_norm": 1.1170405804017387, "learning_rate": 5.017301038062284e-06, "loss": 0.8174, "step": 145 }, { "epoch": 0.07796257796257797, "grad_norm": 1.0601393778580994, "learning_rate": 5.190311418685121e-06, "loss": 0.8095, "step": 150 }, { "epoch": 0.08056133056133057, "grad_norm": 1.0897540737796731, "learning_rate": 5.363321799307959e-06, "loss": 0.7995, "step": 155 }, { "epoch": 0.08316008316008316, "grad_norm": 1.2121810987520705, "learning_rate": 5.536332179930797e-06, "loss": 0.8207, "step": 160 }, { "epoch": 0.08575883575883576, "grad_norm": 1.1378189701749455, "learning_rate": 5.709342560553633e-06, "loss": 0.7946, "step": 165 }, { "epoch": 0.08835758835758836, "grad_norm": 1.0503529917982035, "learning_rate": 5.882352941176471e-06, "loss": 0.8116, "step": 170 }, { "epoch": 0.09095634095634096, "grad_norm": 1.128045536223591, "learning_rate": 6.055363321799308e-06, "loss": 0.7943, "step": 175 }, { "epoch": 0.09355509355509356, "grad_norm": 1.0062924807045572, "learning_rate": 6.228373702422146e-06, "loss": 0.7908, "step": 180 }, { "epoch": 0.09615384615384616, "grad_norm": 1.0645389567201315, "learning_rate": 6.401384083044984e-06, "loss": 0.7961, "step": 185 }, { "epoch": 0.09875259875259876, "grad_norm": 1.1414748404258819, "learning_rate": 6.57439446366782e-06, "loss": 0.7847, "step": 190 }, { "epoch": 0.10135135135135136, "grad_norm": 1.205384007751443, "learning_rate": 6.747404844290658e-06, "loss": 0.7751, "step": 195 }, { "epoch": 0.10395010395010396, "grad_norm": 1.1367398433720104, "learning_rate": 6.9204152249134946e-06, "loss": 0.7919, "step": 200 }, { "epoch": 0.10654885654885655, "grad_norm": 0.9307012296511041, "learning_rate": 7.093425605536333e-06, "loss": 0.7901, "step": 205 }, { "epoch": 0.10914760914760915, "grad_norm": 1.0367934940766987, "learning_rate": 7.2664359861591705e-06, "loss": 0.7895, "step": 210 }, { "epoch": 0.11174636174636175, "grad_norm": 0.96847574603506, "learning_rate": 7.439446366782007e-06, "loss": 0.7883, "step": 215 }, { "epoch": 0.11434511434511435, "grad_norm": 1.1618158896817028, "learning_rate": 7.612456747404845e-06, "loss": 0.7849, "step": 220 }, { "epoch": 0.11694386694386695, "grad_norm": 2.5717464884960584, "learning_rate": 7.785467128027683e-06, "loss": 0.7826, "step": 225 }, { "epoch": 0.11954261954261955, "grad_norm": 1.0453668836748238, "learning_rate": 7.958477508650519e-06, "loss": 0.7682, "step": 230 }, { "epoch": 0.12214137214137215, "grad_norm": 1.0156117659063706, "learning_rate": 8.131487889273357e-06, "loss": 0.7622, "step": 235 }, { "epoch": 0.12474012474012475, "grad_norm": 0.9498681526378566, "learning_rate": 8.304498269896194e-06, "loss": 0.7861, "step": 240 }, { "epoch": 0.12733887733887733, "grad_norm": 1.0843119677144408, "learning_rate": 8.477508650519032e-06, "loss": 0.7804, "step": 245 }, { "epoch": 0.12993762993762994, "grad_norm": 1.1528568447769787, "learning_rate": 8.65051903114187e-06, "loss": 0.7639, "step": 250 }, { "epoch": 0.13253638253638253, "grad_norm": 1.1295719141693836, "learning_rate": 8.823529411764707e-06, "loss": 0.7816, "step": 255 }, { "epoch": 0.13513513513513514, "grad_norm": 0.978783892464181, "learning_rate": 8.996539792387544e-06, "loss": 0.7672, "step": 260 }, { "epoch": 0.13773388773388773, "grad_norm": 0.9379534418690467, "learning_rate": 9.16955017301038e-06, "loss": 0.7702, "step": 265 }, { "epoch": 0.14033264033264034, "grad_norm": 1.1416793594082861, "learning_rate": 9.34256055363322e-06, "loss": 0.7738, "step": 270 }, { "epoch": 0.14293139293139293, "grad_norm": 0.9977084295945086, "learning_rate": 9.515570934256057e-06, "loss": 0.7696, "step": 275 }, { "epoch": 0.14553014553014554, "grad_norm": 1.037149328356884, "learning_rate": 9.688581314878893e-06, "loss": 0.7674, "step": 280 }, { "epoch": 0.14812889812889812, "grad_norm": 1.0165544577935077, "learning_rate": 9.86159169550173e-06, "loss": 0.7554, "step": 285 }, { "epoch": 0.15072765072765074, "grad_norm": 0.9713706199068332, "learning_rate": 1.0034602076124568e-05, "loss": 0.7771, "step": 290 }, { "epoch": 0.15332640332640332, "grad_norm": 1.0519199834853972, "learning_rate": 1.0207612456747407e-05, "loss": 0.7652, "step": 295 }, { "epoch": 0.15592515592515593, "grad_norm": 0.8942757233736588, "learning_rate": 1.0380622837370241e-05, "loss": 0.754, "step": 300 }, { "epoch": 0.15852390852390852, "grad_norm": 0.9760267256597028, "learning_rate": 1.055363321799308e-05, "loss": 0.7597, "step": 305 }, { "epoch": 0.16112266112266113, "grad_norm": 0.8750773264970739, "learning_rate": 1.0726643598615918e-05, "loss": 0.7552, "step": 310 }, { "epoch": 0.16372141372141372, "grad_norm": 0.9317612467807546, "learning_rate": 1.0899653979238756e-05, "loss": 0.7619, "step": 315 }, { "epoch": 0.16632016632016633, "grad_norm": 0.9585051662580469, "learning_rate": 1.1072664359861593e-05, "loss": 0.7562, "step": 320 }, { "epoch": 0.16891891891891891, "grad_norm": 0.9833066117827967, "learning_rate": 1.124567474048443e-05, "loss": 0.7668, "step": 325 }, { "epoch": 0.17151767151767153, "grad_norm": 0.9999136205277245, "learning_rate": 1.1418685121107267e-05, "loss": 0.7563, "step": 330 }, { "epoch": 0.1741164241164241, "grad_norm": 1.063190427210389, "learning_rate": 1.1591695501730104e-05, "loss": 0.7463, "step": 335 }, { "epoch": 0.17671517671517672, "grad_norm": 0.9998506539481437, "learning_rate": 1.1764705882352942e-05, "loss": 0.7533, "step": 340 }, { "epoch": 0.1793139293139293, "grad_norm": 0.929168293634566, "learning_rate": 1.1937716262975781e-05, "loss": 0.7466, "step": 345 }, { "epoch": 0.18191268191268192, "grad_norm": 1.018693960607738, "learning_rate": 1.2110726643598615e-05, "loss": 0.7489, "step": 350 }, { "epoch": 0.1845114345114345, "grad_norm": 0.9657161121572101, "learning_rate": 1.2283737024221455e-05, "loss": 0.7639, "step": 355 }, { "epoch": 0.18711018711018712, "grad_norm": 0.9274247630285816, "learning_rate": 1.2456747404844292e-05, "loss": 0.7458, "step": 360 }, { "epoch": 0.1897089397089397, "grad_norm": 0.8709049483455183, "learning_rate": 1.262975778546713e-05, "loss": 0.7542, "step": 365 }, { "epoch": 0.19230769230769232, "grad_norm": 0.8529475456705145, "learning_rate": 1.2802768166089967e-05, "loss": 0.7615, "step": 370 }, { "epoch": 0.1949064449064449, "grad_norm": 0.8834877993689659, "learning_rate": 1.2975778546712803e-05, "loss": 0.7555, "step": 375 }, { "epoch": 0.19750519750519752, "grad_norm": 0.8612036241498346, "learning_rate": 1.314878892733564e-05, "loss": 0.7455, "step": 380 }, { "epoch": 0.2001039501039501, "grad_norm": 0.972005034702574, "learning_rate": 1.3321799307958478e-05, "loss": 0.7335, "step": 385 }, { "epoch": 0.20270270270270271, "grad_norm": 0.8405468852505008, "learning_rate": 1.3494809688581316e-05, "loss": 0.7454, "step": 390 }, { "epoch": 0.2053014553014553, "grad_norm": 0.94483984497754, "learning_rate": 1.3667820069204153e-05, "loss": 0.7509, "step": 395 }, { "epoch": 0.2079002079002079, "grad_norm": 0.8568496193733218, "learning_rate": 1.3840830449826989e-05, "loss": 0.7386, "step": 400 }, { "epoch": 0.2104989604989605, "grad_norm": 0.9305490201344858, "learning_rate": 1.4013840830449827e-05, "loss": 0.7325, "step": 405 }, { "epoch": 0.2130977130977131, "grad_norm": 0.8391743588987977, "learning_rate": 1.4186851211072666e-05, "loss": 0.7394, "step": 410 }, { "epoch": 0.2156964656964657, "grad_norm": 0.8904148072363134, "learning_rate": 1.4359861591695503e-05, "loss": 0.7659, "step": 415 }, { "epoch": 0.2182952182952183, "grad_norm": 0.9494764558208273, "learning_rate": 1.4532871972318341e-05, "loss": 0.7303, "step": 420 }, { "epoch": 0.2208939708939709, "grad_norm": 0.8729324500601073, "learning_rate": 1.4705882352941179e-05, "loss": 0.7464, "step": 425 }, { "epoch": 0.2234927234927235, "grad_norm": 0.9426426724996545, "learning_rate": 1.4878892733564014e-05, "loss": 0.7425, "step": 430 }, { "epoch": 0.2260914760914761, "grad_norm": 0.8397393999023687, "learning_rate": 1.5051903114186852e-05, "loss": 0.7225, "step": 435 }, { "epoch": 0.2286902286902287, "grad_norm": 0.822001337030522, "learning_rate": 1.522491349480969e-05, "loss": 0.7514, "step": 440 }, { "epoch": 0.2312889812889813, "grad_norm": 0.8196560129735319, "learning_rate": 1.539792387543253e-05, "loss": 0.7455, "step": 445 }, { "epoch": 0.2338877338877339, "grad_norm": 0.9001216187487245, "learning_rate": 1.5570934256055366e-05, "loss": 0.7523, "step": 450 }, { "epoch": 0.23648648648648649, "grad_norm": 0.9230142554852074, "learning_rate": 1.57439446366782e-05, "loss": 0.7569, "step": 455 }, { "epoch": 0.2390852390852391, "grad_norm": 0.8290174186484409, "learning_rate": 1.5916955017301038e-05, "loss": 0.7428, "step": 460 }, { "epoch": 0.24168399168399168, "grad_norm": 0.829715213003188, "learning_rate": 1.6089965397923876e-05, "loss": 0.7457, "step": 465 }, { "epoch": 0.2442827442827443, "grad_norm": 0.8794988465121758, "learning_rate": 1.6262975778546713e-05, "loss": 0.7427, "step": 470 }, { "epoch": 0.24688149688149688, "grad_norm": 0.860878867890723, "learning_rate": 1.6435986159169554e-05, "loss": 0.727, "step": 475 }, { "epoch": 0.2494802494802495, "grad_norm": 0.8488363967170557, "learning_rate": 1.6608996539792388e-05, "loss": 0.7341, "step": 480 }, { "epoch": 0.2520790020790021, "grad_norm": 0.9024495405776305, "learning_rate": 1.6782006920415226e-05, "loss": 0.7445, "step": 485 }, { "epoch": 0.25467775467775466, "grad_norm": 0.8297075062381525, "learning_rate": 1.6955017301038063e-05, "loss": 0.7618, "step": 490 }, { "epoch": 0.25727650727650725, "grad_norm": 0.9522709115263103, "learning_rate": 1.71280276816609e-05, "loss": 0.7524, "step": 495 }, { "epoch": 0.2598752598752599, "grad_norm": 0.8862001527957881, "learning_rate": 1.730103806228374e-05, "loss": 0.7392, "step": 500 }, { "epoch": 0.2624740124740125, "grad_norm": 0.8190856457278167, "learning_rate": 1.7474048442906576e-05, "loss": 0.7348, "step": 505 }, { "epoch": 0.26507276507276506, "grad_norm": 0.8084717872755484, "learning_rate": 1.7647058823529414e-05, "loss": 0.7555, "step": 510 }, { "epoch": 0.26767151767151764, "grad_norm": 0.8143004262657276, "learning_rate": 1.782006920415225e-05, "loss": 0.7493, "step": 515 }, { "epoch": 0.2702702702702703, "grad_norm": 0.7439901950641524, "learning_rate": 1.799307958477509e-05, "loss": 0.7264, "step": 520 }, { "epoch": 0.27286902286902287, "grad_norm": 0.7802119529056604, "learning_rate": 1.8166089965397926e-05, "loss": 0.7484, "step": 525 }, { "epoch": 0.27546777546777546, "grad_norm": 0.7671220875794853, "learning_rate": 1.833910034602076e-05, "loss": 0.7365, "step": 530 }, { "epoch": 0.27806652806652804, "grad_norm": 0.8013129805690585, "learning_rate": 1.8512110726643598e-05, "loss": 0.7586, "step": 535 }, { "epoch": 0.2806652806652807, "grad_norm": 0.7731421991496061, "learning_rate": 1.868512110726644e-05, "loss": 0.7521, "step": 540 }, { "epoch": 0.28326403326403327, "grad_norm": 0.8183545102345747, "learning_rate": 1.8858131487889276e-05, "loss": 0.7379, "step": 545 }, { "epoch": 0.28586278586278585, "grad_norm": 0.761114380449014, "learning_rate": 1.9031141868512114e-05, "loss": 0.7489, "step": 550 }, { "epoch": 0.28846153846153844, "grad_norm": 0.797967905949635, "learning_rate": 1.9204152249134948e-05, "loss": 0.7475, "step": 555 }, { "epoch": 0.2910602910602911, "grad_norm": 0.8141772027308778, "learning_rate": 1.9377162629757786e-05, "loss": 0.7403, "step": 560 }, { "epoch": 0.29365904365904366, "grad_norm": 1.0002732271715242, "learning_rate": 1.9550173010380623e-05, "loss": 0.7446, "step": 565 }, { "epoch": 0.29625779625779625, "grad_norm": 0.7424317625579876, "learning_rate": 1.972318339100346e-05, "loss": 0.7432, "step": 570 }, { "epoch": 0.29885654885654883, "grad_norm": 0.7975265418685308, "learning_rate": 1.98961937716263e-05, "loss": 0.7425, "step": 575 }, { "epoch": 0.30145530145530147, "grad_norm": 0.8527920792318469, "learning_rate": 1.9999992683122277e-05, "loss": 0.7313, "step": 580 }, { "epoch": 0.30405405405405406, "grad_norm": 0.7826703424284943, "learning_rate": 1.9999910368370826e-05, "loss": 0.7404, "step": 585 }, { "epoch": 0.30665280665280664, "grad_norm": 0.7942647670210833, "learning_rate": 1.9999736593526133e-05, "loss": 0.7263, "step": 590 }, { "epoch": 0.3092515592515592, "grad_norm": 0.7552220975281737, "learning_rate": 1.999947136017756e-05, "loss": 0.7353, "step": 595 }, { "epoch": 0.31185031185031187, "grad_norm": 0.7790597350916263, "learning_rate": 1.9999114670750955e-05, "loss": 0.7478, "step": 600 }, { "epoch": 0.31444906444906445, "grad_norm": 0.7982754500449706, "learning_rate": 1.9998666528508632e-05, "loss": 0.7414, "step": 605 }, { "epoch": 0.31704781704781704, "grad_norm": 0.8159770553033799, "learning_rate": 1.9998126937549343e-05, "loss": 0.7285, "step": 610 }, { "epoch": 0.3196465696465696, "grad_norm": 0.8888821616512309, "learning_rate": 1.9997495902808233e-05, "loss": 0.751, "step": 615 }, { "epoch": 0.32224532224532226, "grad_norm": 0.7544060206964511, "learning_rate": 1.9996773430056806e-05, "loss": 0.7385, "step": 620 }, { "epoch": 0.32484407484407485, "grad_norm": 0.7895944868586088, "learning_rate": 1.9995959525902856e-05, "loss": 0.7369, "step": 625 }, { "epoch": 0.32744282744282743, "grad_norm": 0.7602727085172243, "learning_rate": 1.999505419779044e-05, "loss": 0.757, "step": 630 }, { "epoch": 0.33004158004158, "grad_norm": 0.8764699729246701, "learning_rate": 1.9994057453999754e-05, "loss": 0.738, "step": 635 }, { "epoch": 0.33264033264033266, "grad_norm": 0.7647288391752125, "learning_rate": 1.9992969303647124e-05, "loss": 0.7478, "step": 640 }, { "epoch": 0.33523908523908524, "grad_norm": 0.7069726215488147, "learning_rate": 1.999178975668486e-05, "loss": 0.7149, "step": 645 }, { "epoch": 0.33783783783783783, "grad_norm": 0.6497220082269107, "learning_rate": 1.9990518823901213e-05, "loss": 0.7496, "step": 650 }, { "epoch": 0.3404365904365904, "grad_norm": 0.6610573730423013, "learning_rate": 1.9989156516920248e-05, "loss": 0.7297, "step": 655 }, { "epoch": 0.34303534303534305, "grad_norm": 0.6668607876517594, "learning_rate": 1.9987702848201748e-05, "loss": 0.7193, "step": 660 }, { "epoch": 0.34563409563409564, "grad_norm": 0.7860493254567829, "learning_rate": 1.99861578310411e-05, "loss": 0.7374, "step": 665 }, { "epoch": 0.3482328482328482, "grad_norm": 0.8925562926124014, "learning_rate": 1.9984521479569176e-05, "loss": 0.7237, "step": 670 }, { "epoch": 0.3508316008316008, "grad_norm": 0.7672501463240459, "learning_rate": 1.9982793808752193e-05, "loss": 0.7306, "step": 675 }, { "epoch": 0.35343035343035345, "grad_norm": 0.7502385743686751, "learning_rate": 1.9980974834391583e-05, "loss": 0.7406, "step": 680 }, { "epoch": 0.35602910602910603, "grad_norm": 0.7564023469276626, "learning_rate": 1.997906457312386e-05, "loss": 0.7354, "step": 685 }, { "epoch": 0.3586278586278586, "grad_norm": 0.7147365409493106, "learning_rate": 1.9977063042420438e-05, "loss": 0.7312, "step": 690 }, { "epoch": 0.3612266112266112, "grad_norm": 0.8321336652388966, "learning_rate": 1.9974970260587507e-05, "loss": 0.7364, "step": 695 }, { "epoch": 0.36382536382536385, "grad_norm": 0.6733104515770179, "learning_rate": 1.9972786246765832e-05, "loss": 0.7273, "step": 700 }, { "epoch": 0.36642411642411643, "grad_norm": 0.6923993722045795, "learning_rate": 1.9970511020930612e-05, "loss": 0.7259, "step": 705 }, { "epoch": 0.369022869022869, "grad_norm": 0.7106843030691585, "learning_rate": 1.9968144603891272e-05, "loss": 0.7409, "step": 710 }, { "epoch": 0.3716216216216216, "grad_norm": 0.6074610563242314, "learning_rate": 1.9965687017291268e-05, "loss": 0.7237, "step": 715 }, { "epoch": 0.37422037422037424, "grad_norm": 0.6798181846974808, "learning_rate": 1.9963138283607918e-05, "loss": 0.7189, "step": 720 }, { "epoch": 0.3768191268191268, "grad_norm": 0.7233959402973988, "learning_rate": 1.996049842615217e-05, "loss": 0.7524, "step": 725 }, { "epoch": 0.3794178794178794, "grad_norm": 0.7818955262414797, "learning_rate": 1.9957767469068405e-05, "loss": 0.7259, "step": 730 }, { "epoch": 0.382016632016632, "grad_norm": 0.7248772563760029, "learning_rate": 1.9954945437334204e-05, "loss": 0.7312, "step": 735 }, { "epoch": 0.38461538461538464, "grad_norm": 0.7079790287253178, "learning_rate": 1.9952032356760125e-05, "loss": 0.7041, "step": 740 }, { "epoch": 0.3872141372141372, "grad_norm": 0.7390341417388404, "learning_rate": 1.994902825398947e-05, "loss": 0.7133, "step": 745 }, { "epoch": 0.3898128898128898, "grad_norm": 0.8111822578128921, "learning_rate": 1.9945933156498043e-05, "loss": 0.729, "step": 750 }, { "epoch": 0.3924116424116424, "grad_norm": 0.7769690688975751, "learning_rate": 1.9942747092593877e-05, "loss": 0.715, "step": 755 }, { "epoch": 0.39501039501039503, "grad_norm": 0.6870020477467483, "learning_rate": 1.9939470091417012e-05, "loss": 0.7132, "step": 760 }, { "epoch": 0.3976091476091476, "grad_norm": 0.6911704853875393, "learning_rate": 1.99361021829392e-05, "loss": 0.7206, "step": 765 }, { "epoch": 0.4002079002079002, "grad_norm": 0.6600143593403244, "learning_rate": 1.993264339796363e-05, "loss": 0.7145, "step": 770 }, { "epoch": 0.4028066528066528, "grad_norm": 0.6726432015084747, "learning_rate": 1.992909376812468e-05, "loss": 0.739, "step": 775 }, { "epoch": 0.40540540540540543, "grad_norm": 0.70615782283502, "learning_rate": 1.9925453325887574e-05, "loss": 0.7222, "step": 780 }, { "epoch": 0.408004158004158, "grad_norm": 0.5887807785936404, "learning_rate": 1.992172210454814e-05, "loss": 0.7221, "step": 785 }, { "epoch": 0.4106029106029106, "grad_norm": 0.6357715206079563, "learning_rate": 1.991790013823246e-05, "loss": 0.717, "step": 790 }, { "epoch": 0.4132016632016632, "grad_norm": 0.6710571738289492, "learning_rate": 1.9913987461896597e-05, "loss": 0.7299, "step": 795 }, { "epoch": 0.4158004158004158, "grad_norm": 0.66534846887862, "learning_rate": 1.990998411132624e-05, "loss": 0.719, "step": 800 }, { "epoch": 0.4183991683991684, "grad_norm": 0.6658127042254826, "learning_rate": 1.9905890123136396e-05, "loss": 0.7156, "step": 805 }, { "epoch": 0.420997920997921, "grad_norm": 0.7461519732525459, "learning_rate": 1.990170553477106e-05, "loss": 0.7281, "step": 810 }, { "epoch": 0.4235966735966736, "grad_norm": 0.7960568157470115, "learning_rate": 1.9897430384502857e-05, "loss": 0.7229, "step": 815 }, { "epoch": 0.4261954261954262, "grad_norm": 0.7377717323529744, "learning_rate": 1.9893064711432702e-05, "loss": 0.7207, "step": 820 }, { "epoch": 0.4287941787941788, "grad_norm": 0.6333020229736416, "learning_rate": 1.988860855548944e-05, "loss": 0.7104, "step": 825 }, { "epoch": 0.4313929313929314, "grad_norm": 0.6099863945288464, "learning_rate": 1.988406195742948e-05, "loss": 0.7203, "step": 830 }, { "epoch": 0.433991683991684, "grad_norm": 0.6785164904650527, "learning_rate": 1.987942495883642e-05, "loss": 0.711, "step": 835 }, { "epoch": 0.4365904365904366, "grad_norm": 0.6622509941324428, "learning_rate": 1.9874697602120682e-05, "loss": 0.7325, "step": 840 }, { "epoch": 0.4391891891891892, "grad_norm": 0.6613983198156271, "learning_rate": 1.986987993051909e-05, "loss": 0.7233, "step": 845 }, { "epoch": 0.4417879417879418, "grad_norm": 0.6956512036405856, "learning_rate": 1.9864971988094515e-05, "loss": 0.7207, "step": 850 }, { "epoch": 0.44438669438669437, "grad_norm": 0.6797121388808018, "learning_rate": 1.9859973819735443e-05, "loss": 0.7359, "step": 855 }, { "epoch": 0.446985446985447, "grad_norm": 0.6634402820404799, "learning_rate": 1.9854885471155586e-05, "loss": 0.7094, "step": 860 }, { "epoch": 0.4495841995841996, "grad_norm": 0.6337794885487144, "learning_rate": 1.9849706988893433e-05, "loss": 0.7276, "step": 865 }, { "epoch": 0.4521829521829522, "grad_norm": 0.6734603186331721, "learning_rate": 1.9844438420311863e-05, "loss": 0.7142, "step": 870 }, { "epoch": 0.45478170478170477, "grad_norm": 0.7066192892075979, "learning_rate": 1.9839079813597687e-05, "loss": 0.7149, "step": 875 }, { "epoch": 0.4573804573804574, "grad_norm": 0.6964600685285819, "learning_rate": 1.9833631217761204e-05, "loss": 0.7281, "step": 880 }, { "epoch": 0.45997920997921, "grad_norm": 0.7109456157271579, "learning_rate": 1.9828092682635774e-05, "loss": 0.7332, "step": 885 }, { "epoch": 0.4625779625779626, "grad_norm": 0.669236169004855, "learning_rate": 1.9822464258877345e-05, "loss": 0.7293, "step": 890 }, { "epoch": 0.46517671517671516, "grad_norm": 0.6046679594816758, "learning_rate": 1.9816745997963996e-05, "loss": 0.706, "step": 895 }, { "epoch": 0.4677754677754678, "grad_norm": 0.6553388635341802, "learning_rate": 1.981093795219546e-05, "loss": 0.7136, "step": 900 }, { "epoch": 0.4703742203742204, "grad_norm": 0.6699423752938592, "learning_rate": 1.980504017469265e-05, "loss": 0.7056, "step": 905 }, { "epoch": 0.47297297297297297, "grad_norm": 0.637041202100537, "learning_rate": 1.9799052719397188e-05, "loss": 0.7221, "step": 910 }, { "epoch": 0.47557172557172556, "grad_norm": 0.657103082344547, "learning_rate": 1.979297564107088e-05, "loss": 0.7271, "step": 915 }, { "epoch": 0.4781704781704782, "grad_norm": 0.6885064889983316, "learning_rate": 1.978680899529524e-05, "loss": 0.7159, "step": 920 }, { "epoch": 0.4807692307692308, "grad_norm": 0.7043512832125569, "learning_rate": 1.9780552838470976e-05, "loss": 0.7057, "step": 925 }, { "epoch": 0.48336798336798337, "grad_norm": 0.6627639804543833, "learning_rate": 1.977420722781746e-05, "loss": 0.7194, "step": 930 }, { "epoch": 0.48596673596673595, "grad_norm": 0.7069767251302125, "learning_rate": 1.976777222137224e-05, "loss": 0.7144, "step": 935 }, { "epoch": 0.4885654885654886, "grad_norm": 0.6090332104645865, "learning_rate": 1.9761247877990465e-05, "loss": 0.7161, "step": 940 }, { "epoch": 0.4911642411642412, "grad_norm": 0.7085090978015706, "learning_rate": 1.9754634257344376e-05, "loss": 0.733, "step": 945 }, { "epoch": 0.49376299376299376, "grad_norm": 0.6858008371045625, "learning_rate": 1.9747931419922756e-05, "loss": 0.7271, "step": 950 }, { "epoch": 0.49636174636174635, "grad_norm": 0.6543468152194417, "learning_rate": 1.974113942703036e-05, "loss": 0.7052, "step": 955 }, { "epoch": 0.498960498960499, "grad_norm": 0.7013937541029002, "learning_rate": 1.9734258340787376e-05, "loss": 0.7233, "step": 960 }, { "epoch": 0.5015592515592515, "grad_norm": 0.6660787930797433, "learning_rate": 1.9727288224128852e-05, "loss": 0.7196, "step": 965 }, { "epoch": 0.5041580041580042, "grad_norm": 0.6541474437978503, "learning_rate": 1.972022914080411e-05, "loss": 0.7061, "step": 970 }, { "epoch": 0.5067567567567568, "grad_norm": 0.66883512467633, "learning_rate": 1.971308115537617e-05, "loss": 0.7146, "step": 975 }, { "epoch": 0.5093555093555093, "grad_norm": 0.6381037219289445, "learning_rate": 1.970584433322116e-05, "loss": 0.7087, "step": 980 }, { "epoch": 0.511954261954262, "grad_norm": 1.1110934200883047, "learning_rate": 1.969851874052771e-05, "loss": 0.73, "step": 985 }, { "epoch": 0.5145530145530145, "grad_norm": 0.6310880004216817, "learning_rate": 1.969110444429637e-05, "loss": 0.7183, "step": 990 }, { "epoch": 0.5171517671517671, "grad_norm": 0.6410220872400427, "learning_rate": 1.9683601512338963e-05, "loss": 0.7086, "step": 995 }, { "epoch": 0.5197505197505198, "grad_norm": 0.5996299242899708, "learning_rate": 1.9676010013277994e-05, "loss": 0.7267, "step": 1000 }, { "epoch": 0.5223492723492723, "grad_norm": 0.6590949790761541, "learning_rate": 1.9668330016546004e-05, "loss": 0.7374, "step": 1005 }, { "epoch": 0.524948024948025, "grad_norm": 0.6221180388276606, "learning_rate": 1.9660561592384946e-05, "loss": 0.7249, "step": 1010 }, { "epoch": 0.5275467775467776, "grad_norm": 0.6026562255053431, "learning_rate": 1.965270481184553e-05, "loss": 0.7092, "step": 1015 }, { "epoch": 0.5301455301455301, "grad_norm": 0.6076685489615162, "learning_rate": 1.9644759746786598e-05, "loss": 0.7144, "step": 1020 }, { "epoch": 0.5327442827442828, "grad_norm": 0.6466092379791593, "learning_rate": 1.9636726469874437e-05, "loss": 0.7021, "step": 1025 }, { "epoch": 0.5353430353430353, "grad_norm": 0.6614878159031965, "learning_rate": 1.962860505458213e-05, "loss": 0.7147, "step": 1030 }, { "epoch": 0.5379417879417879, "grad_norm": 0.642038696775677, "learning_rate": 1.962039557518888e-05, "loss": 0.7064, "step": 1035 }, { "epoch": 0.5405405405405406, "grad_norm": 0.6053359618387539, "learning_rate": 1.961209810677934e-05, "loss": 0.7103, "step": 1040 }, { "epoch": 0.5431392931392931, "grad_norm": 0.6208076453451457, "learning_rate": 1.960371272524291e-05, "loss": 0.717, "step": 1045 }, { "epoch": 0.5457380457380457, "grad_norm": 0.6668283857181149, "learning_rate": 1.9595239507273058e-05, "loss": 0.7048, "step": 1050 }, { "epoch": 0.5483367983367984, "grad_norm": 0.6229130725064413, "learning_rate": 1.9586678530366607e-05, "loss": 0.7159, "step": 1055 }, { "epoch": 0.5509355509355509, "grad_norm": 0.5933645289790093, "learning_rate": 1.9578029872823038e-05, "loss": 0.7131, "step": 1060 }, { "epoch": 0.5535343035343036, "grad_norm": 0.5857860624157782, "learning_rate": 1.9569293613743753e-05, "loss": 0.7037, "step": 1065 }, { "epoch": 0.5561330561330561, "grad_norm": 0.6425150123453736, "learning_rate": 1.9560469833031383e-05, "loss": 0.7098, "step": 1070 }, { "epoch": 0.5587318087318087, "grad_norm": 0.619116481041439, "learning_rate": 1.955155861138903e-05, "loss": 0.7176, "step": 1075 }, { "epoch": 0.5613305613305614, "grad_norm": 0.635662450753945, "learning_rate": 1.9542560030319543e-05, "loss": 0.7104, "step": 1080 }, { "epoch": 0.5639293139293139, "grad_norm": 0.6273126414001168, "learning_rate": 1.9533474172124763e-05, "loss": 0.7144, "step": 1085 }, { "epoch": 0.5665280665280665, "grad_norm": 0.5746605470244842, "learning_rate": 1.952430111990478e-05, "loss": 0.7058, "step": 1090 }, { "epoch": 0.5691268191268192, "grad_norm": 0.616041790905867, "learning_rate": 1.9515040957557162e-05, "loss": 0.7144, "step": 1095 }, { "epoch": 0.5717255717255717, "grad_norm": 0.5604195549287683, "learning_rate": 1.950569376977621e-05, "loss": 0.7045, "step": 1100 }, { "epoch": 0.5743243243243243, "grad_norm": 0.5873428291768331, "learning_rate": 1.9496259642052146e-05, "loss": 0.7121, "step": 1105 }, { "epoch": 0.5769230769230769, "grad_norm": 0.6316260348752082, "learning_rate": 1.9486738660670373e-05, "loss": 0.7147, "step": 1110 }, { "epoch": 0.5795218295218295, "grad_norm": 0.5874076313405716, "learning_rate": 1.9477130912710648e-05, "loss": 0.7279, "step": 1115 }, { "epoch": 0.5821205821205822, "grad_norm": 0.6676704495322479, "learning_rate": 1.9467436486046317e-05, "loss": 0.7103, "step": 1120 }, { "epoch": 0.5847193347193347, "grad_norm": 0.5827519526288305, "learning_rate": 1.9457655469343482e-05, "loss": 0.7014, "step": 1125 }, { "epoch": 0.5873180873180873, "grad_norm": 0.6037871799524279, "learning_rate": 1.944778795206023e-05, "loss": 0.7053, "step": 1130 }, { "epoch": 0.58991683991684, "grad_norm": 0.5712930594682987, "learning_rate": 1.9437834024445762e-05, "loss": 0.7177, "step": 1135 }, { "epoch": 0.5925155925155925, "grad_norm": 0.5970793241519367, "learning_rate": 1.9427793777539615e-05, "loss": 0.7127, "step": 1140 }, { "epoch": 0.5951143451143451, "grad_norm": 0.6209626687697077, "learning_rate": 1.9417667303170803e-05, "loss": 0.7063, "step": 1145 }, { "epoch": 0.5977130977130977, "grad_norm": 0.606055879020448, "learning_rate": 1.940745469395698e-05, "loss": 0.695, "step": 1150 }, { "epoch": 0.6003118503118503, "grad_norm": 0.6188361477212587, "learning_rate": 1.9397156043303608e-05, "loss": 0.6966, "step": 1155 }, { "epoch": 0.6029106029106029, "grad_norm": 0.6290824936609826, "learning_rate": 1.9386771445403086e-05, "loss": 0.7031, "step": 1160 }, { "epoch": 0.6055093555093555, "grad_norm": 0.6122110814030293, "learning_rate": 1.9376300995233894e-05, "loss": 0.7083, "step": 1165 }, { "epoch": 0.6081081081081081, "grad_norm": 0.6273881899574172, "learning_rate": 1.9365744788559725e-05, "loss": 0.7023, "step": 1170 }, { "epoch": 0.6107068607068608, "grad_norm": 0.5897253804349057, "learning_rate": 1.9355102921928606e-05, "loss": 0.7141, "step": 1175 }, { "epoch": 0.6133056133056133, "grad_norm": 0.6492672708845987, "learning_rate": 1.9344375492672024e-05, "loss": 0.7097, "step": 1180 }, { "epoch": 0.6159043659043659, "grad_norm": 0.5984352418529706, "learning_rate": 1.9333562598904027e-05, "loss": 0.7068, "step": 1185 }, { "epoch": 0.6185031185031185, "grad_norm": 0.6401953574979402, "learning_rate": 1.9322664339520328e-05, "loss": 0.7007, "step": 1190 }, { "epoch": 0.6211018711018711, "grad_norm": 0.6292156047444384, "learning_rate": 1.93116808141974e-05, "loss": 0.7114, "step": 1195 }, { "epoch": 0.6237006237006237, "grad_norm": 0.6298280430381119, "learning_rate": 1.9300612123391574e-05, "loss": 0.7224, "step": 1200 }, { "epoch": 0.6262993762993763, "grad_norm": 0.5647290693137603, "learning_rate": 1.92894583683381e-05, "loss": 0.7029, "step": 1205 }, { "epoch": 0.6288981288981289, "grad_norm": 0.5843497274151073, "learning_rate": 1.927821965105024e-05, "loss": 0.6935, "step": 1210 }, { "epoch": 0.6314968814968815, "grad_norm": 0.5742329412422685, "learning_rate": 1.9266896074318335e-05, "loss": 0.6921, "step": 1215 }, { "epoch": 0.6340956340956341, "grad_norm": 0.6198304134928966, "learning_rate": 1.925548774170885e-05, "loss": 0.7022, "step": 1220 }, { "epoch": 0.6366943866943867, "grad_norm": 0.5927355008313566, "learning_rate": 1.924399475756343e-05, "loss": 0.7043, "step": 1225 }, { "epoch": 0.6392931392931392, "grad_norm": 0.5675856487929543, "learning_rate": 1.9232417226997964e-05, "loss": 0.6979, "step": 1230 }, { "epoch": 0.6418918918918919, "grad_norm": 0.5801729136751573, "learning_rate": 1.9220755255901604e-05, "loss": 0.7128, "step": 1235 }, { "epoch": 0.6444906444906445, "grad_norm": 0.5455856005670234, "learning_rate": 1.92090089509358e-05, "loss": 0.7154, "step": 1240 }, { "epoch": 0.6470893970893971, "grad_norm": 0.578411372283767, "learning_rate": 1.9197178419533328e-05, "loss": 0.726, "step": 1245 }, { "epoch": 0.6496881496881497, "grad_norm": 0.6165732640247198, "learning_rate": 1.918526376989731e-05, "loss": 0.7097, "step": 1250 }, { "epoch": 0.6522869022869023, "grad_norm": 0.579722849123064, "learning_rate": 1.9173265111000218e-05, "loss": 0.7181, "step": 1255 }, { "epoch": 0.6548856548856549, "grad_norm": 0.6384864268465269, "learning_rate": 1.9161182552582885e-05, "loss": 0.7048, "step": 1260 }, { "epoch": 0.6574844074844075, "grad_norm": 0.5442756986247173, "learning_rate": 1.9149016205153494e-05, "loss": 0.6983, "step": 1265 }, { "epoch": 0.66008316008316, "grad_norm": 0.5876328008368029, "learning_rate": 1.9136766179986566e-05, "loss": 0.7058, "step": 1270 }, { "epoch": 0.6626819126819127, "grad_norm": 0.556923286518879, "learning_rate": 1.9124432589121945e-05, "loss": 0.7048, "step": 1275 }, { "epoch": 0.6652806652806653, "grad_norm": 0.5614004903256652, "learning_rate": 1.9112015545363793e-05, "loss": 0.703, "step": 1280 }, { "epoch": 0.6678794178794178, "grad_norm": 0.6029085739466059, "learning_rate": 1.9099515162279515e-05, "loss": 0.7149, "step": 1285 }, { "epoch": 0.6704781704781705, "grad_norm": 0.5542833475447663, "learning_rate": 1.9086931554198756e-05, "loss": 0.7059, "step": 1290 }, { "epoch": 0.6730769230769231, "grad_norm": 0.596418111214614, "learning_rate": 1.907426483621235e-05, "loss": 0.7187, "step": 1295 }, { "epoch": 0.6756756756756757, "grad_norm": 0.5964335487077739, "learning_rate": 1.9061515124171254e-05, "loss": 0.7023, "step": 1300 }, { "epoch": 0.6782744282744283, "grad_norm": 0.6527135172773815, "learning_rate": 1.90486825346855e-05, "loss": 0.6985, "step": 1305 }, { "epoch": 0.6808731808731808, "grad_norm": 0.5854908036035414, "learning_rate": 1.9035767185123118e-05, "loss": 0.7097, "step": 1310 }, { "epoch": 0.6834719334719335, "grad_norm": 0.5630331628185049, "learning_rate": 1.9022769193609077e-05, "loss": 0.6973, "step": 1315 }, { "epoch": 0.6860706860706861, "grad_norm": 0.5872323843899289, "learning_rate": 1.900968867902419e-05, "loss": 0.7069, "step": 1320 }, { "epoch": 0.6886694386694386, "grad_norm": 0.5845474538391455, "learning_rate": 1.899652576100405e-05, "loss": 0.7169, "step": 1325 }, { "epoch": 0.6912681912681913, "grad_norm": 0.6164999248623418, "learning_rate": 1.8983280559937896e-05, "loss": 0.7005, "step": 1330 }, { "epoch": 0.6938669438669439, "grad_norm": 0.6124510306800306, "learning_rate": 1.896995319696755e-05, "loss": 0.701, "step": 1335 }, { "epoch": 0.6964656964656964, "grad_norm": 0.6267010331850633, "learning_rate": 1.8956543793986287e-05, "loss": 0.7164, "step": 1340 }, { "epoch": 0.6990644490644491, "grad_norm": 0.5961399562877898, "learning_rate": 1.8943052473637734e-05, "loss": 0.7213, "step": 1345 }, { "epoch": 0.7016632016632016, "grad_norm": 0.6174817199293855, "learning_rate": 1.8929479359314742e-05, "loss": 0.6985, "step": 1350 }, { "epoch": 0.7042619542619543, "grad_norm": 0.5851247926140993, "learning_rate": 1.891582457515825e-05, "loss": 0.6935, "step": 1355 }, { "epoch": 0.7068607068607069, "grad_norm": 0.5776477138388799, "learning_rate": 1.890208824605616e-05, "loss": 0.708, "step": 1360 }, { "epoch": 0.7094594594594594, "grad_norm": 0.5309187069380664, "learning_rate": 1.888827049764219e-05, "loss": 0.7003, "step": 1365 }, { "epoch": 0.7120582120582121, "grad_norm": 0.5496529326574807, "learning_rate": 1.8874371456294732e-05, "loss": 0.6999, "step": 1370 }, { "epoch": 0.7146569646569647, "grad_norm": 0.5339269514909717, "learning_rate": 1.8860391249135692e-05, "loss": 0.6966, "step": 1375 }, { "epoch": 0.7172557172557172, "grad_norm": 0.5427973139574223, "learning_rate": 1.884633000402931e-05, "loss": 0.6936, "step": 1380 }, { "epoch": 0.7198544698544699, "grad_norm": 0.5672590602791164, "learning_rate": 1.883218784958103e-05, "loss": 0.689, "step": 1385 }, { "epoch": 0.7224532224532224, "grad_norm": 0.5402673826941384, "learning_rate": 1.8817964915136277e-05, "loss": 0.7072, "step": 1390 }, { "epoch": 0.725051975051975, "grad_norm": 0.5601951835510618, "learning_rate": 1.8803661330779316e-05, "loss": 0.7059, "step": 1395 }, { "epoch": 0.7276507276507277, "grad_norm": 0.5857868039965994, "learning_rate": 1.8789277227332025e-05, "loss": 0.6799, "step": 1400 }, { "epoch": 0.7302494802494802, "grad_norm": 0.5347885501584507, "learning_rate": 1.877481273635274e-05, "loss": 0.6956, "step": 1405 }, { "epoch": 0.7328482328482329, "grad_norm": 0.5502849975189612, "learning_rate": 1.8760267990135007e-05, "loss": 0.7059, "step": 1410 }, { "epoch": 0.7354469854469855, "grad_norm": 0.5178257228797314, "learning_rate": 1.874564312170641e-05, "loss": 0.7019, "step": 1415 }, { "epoch": 0.738045738045738, "grad_norm": 0.5607208259193451, "learning_rate": 1.8730938264827322e-05, "loss": 0.6963, "step": 1420 }, { "epoch": 0.7406444906444907, "grad_norm": 0.5821162244405798, "learning_rate": 1.8716153553989716e-05, "loss": 0.6965, "step": 1425 }, { "epoch": 0.7432432432432432, "grad_norm": 0.5495747594677731, "learning_rate": 1.8701289124415902e-05, "loss": 0.6963, "step": 1430 }, { "epoch": 0.7458419958419958, "grad_norm": 0.528297292924797, "learning_rate": 1.868634511205731e-05, "loss": 0.6917, "step": 1435 }, { "epoch": 0.7484407484407485, "grad_norm": 0.5326976953811587, "learning_rate": 1.8671321653593244e-05, "loss": 0.6989, "step": 1440 }, { "epoch": 0.751039501039501, "grad_norm": 0.5584186177167862, "learning_rate": 1.8656218886429624e-05, "loss": 0.7031, "step": 1445 }, { "epoch": 0.7536382536382537, "grad_norm": 0.5570198454055475, "learning_rate": 1.8641036948697736e-05, "loss": 0.7023, "step": 1450 }, { "epoch": 0.7562370062370062, "grad_norm": 0.6760644666500142, "learning_rate": 1.8625775979252976e-05, "loss": 0.6789, "step": 1455 }, { "epoch": 0.7588357588357588, "grad_norm": 0.5638434234347486, "learning_rate": 1.8610436117673557e-05, "loss": 0.6986, "step": 1460 }, { "epoch": 0.7614345114345115, "grad_norm": 0.5493778378867652, "learning_rate": 1.8595017504259253e-05, "loss": 0.6785, "step": 1465 }, { "epoch": 0.764033264033264, "grad_norm": 0.6031826832296197, "learning_rate": 1.8579520280030118e-05, "loss": 0.6995, "step": 1470 }, { "epoch": 0.7666320166320166, "grad_norm": 0.5143780295012962, "learning_rate": 1.8563944586725175e-05, "loss": 0.6846, "step": 1475 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5539515728601708, "learning_rate": 1.8548290566801132e-05, "loss": 0.7238, "step": 1480 }, { "epoch": 0.7718295218295218, "grad_norm": 0.5421409786755411, "learning_rate": 1.853255836343109e-05, "loss": 0.6999, "step": 1485 }, { "epoch": 0.7744282744282744, "grad_norm": 0.6141673616193241, "learning_rate": 1.8516748120503217e-05, "loss": 0.6899, "step": 1490 }, { "epoch": 0.777027027027027, "grad_norm": 0.6088321493956566, "learning_rate": 1.8500859982619438e-05, "loss": 0.6985, "step": 1495 }, { "epoch": 0.7796257796257796, "grad_norm": 0.5792987579663321, "learning_rate": 1.848489409509411e-05, "loss": 0.7015, "step": 1500 }, { "epoch": 0.7822245322245323, "grad_norm": 0.5889071004078938, "learning_rate": 1.84688506039527e-05, "loss": 0.6961, "step": 1505 }, { "epoch": 0.7848232848232848, "grad_norm": 0.6583485451018368, "learning_rate": 1.845272965593045e-05, "loss": 0.6999, "step": 1510 }, { "epoch": 0.7874220374220374, "grad_norm": 0.5605926278169279, "learning_rate": 1.843653139847101e-05, "loss": 0.6862, "step": 1515 }, { "epoch": 0.7900207900207901, "grad_norm": 0.5528728709462963, "learning_rate": 1.842025597972513e-05, "loss": 0.697, "step": 1520 }, { "epoch": 0.7926195426195426, "grad_norm": 0.5793992149063935, "learning_rate": 1.840390354854927e-05, "loss": 0.6981, "step": 1525 }, { "epoch": 0.7952182952182952, "grad_norm": 0.5629064758499602, "learning_rate": 1.8387474254504265e-05, "loss": 0.6847, "step": 1530 }, { "epoch": 0.7978170478170478, "grad_norm": 0.5625360669791298, "learning_rate": 1.8370968247853933e-05, "loss": 0.7102, "step": 1535 }, { "epoch": 0.8004158004158004, "grad_norm": 0.575364667753087, "learning_rate": 1.8354385679563723e-05, "loss": 0.7028, "step": 1540 }, { "epoch": 0.803014553014553, "grad_norm": 0.5391664994143878, "learning_rate": 1.8337726701299313e-05, "loss": 0.6972, "step": 1545 }, { "epoch": 0.8056133056133056, "grad_norm": 0.5411008753649549, "learning_rate": 1.8320991465425243e-05, "loss": 0.6903, "step": 1550 }, { "epoch": 0.8082120582120582, "grad_norm": 0.5247464270778599, "learning_rate": 1.8304180125003505e-05, "loss": 0.6892, "step": 1555 }, { "epoch": 0.8108108108108109, "grad_norm": 0.5616645790978936, "learning_rate": 1.8287292833792157e-05, "loss": 0.6996, "step": 1560 }, { "epoch": 0.8134095634095634, "grad_norm": 0.5496955252051037, "learning_rate": 1.8270329746243903e-05, "loss": 0.7093, "step": 1565 }, { "epoch": 0.816008316008316, "grad_norm": 0.5727569676124988, "learning_rate": 1.8253291017504694e-05, "loss": 0.6921, "step": 1570 }, { "epoch": 0.8186070686070686, "grad_norm": 0.5394235138224174, "learning_rate": 1.8236176803412296e-05, "loss": 0.6915, "step": 1575 }, { "epoch": 0.8212058212058212, "grad_norm": 0.5545057246411995, "learning_rate": 1.8218987260494877e-05, "loss": 0.7076, "step": 1580 }, { "epoch": 0.8238045738045738, "grad_norm": 0.5440533432959407, "learning_rate": 1.820172254596956e-05, "loss": 0.6765, "step": 1585 }, { "epoch": 0.8264033264033264, "grad_norm": 0.5572733236733112, "learning_rate": 1.8184382817741005e-05, "loss": 0.699, "step": 1590 }, { "epoch": 0.829002079002079, "grad_norm": 0.5531936584129153, "learning_rate": 1.816696823439995e-05, "loss": 0.6921, "step": 1595 }, { "epoch": 0.8316008316008316, "grad_norm": 0.5885460971318106, "learning_rate": 1.814947895522176e-05, "loss": 0.7058, "step": 1600 }, { "epoch": 0.8341995841995842, "grad_norm": 0.5258234834971192, "learning_rate": 1.8131915140164985e-05, "loss": 0.7075, "step": 1605 }, { "epoch": 0.8367983367983368, "grad_norm": 0.6039050150490132, "learning_rate": 1.8114276949869877e-05, "loss": 0.7022, "step": 1610 }, { "epoch": 0.8393970893970893, "grad_norm": 0.5562997078883312, "learning_rate": 1.809656454565693e-05, "loss": 0.7079, "step": 1615 }, { "epoch": 0.841995841995842, "grad_norm": 0.5537066778477406, "learning_rate": 1.8078778089525423e-05, "loss": 0.6982, "step": 1620 }, { "epoch": 0.8445945945945946, "grad_norm": 0.5336085274714755, "learning_rate": 1.80609177441519e-05, "loss": 0.6813, "step": 1625 }, { "epoch": 0.8471933471933472, "grad_norm": 0.5921356105703777, "learning_rate": 1.8042983672888706e-05, "loss": 0.6982, "step": 1630 }, { "epoch": 0.8497920997920998, "grad_norm": 0.5823716073373996, "learning_rate": 1.8024976039762507e-05, "loss": 0.7007, "step": 1635 }, { "epoch": 0.8523908523908524, "grad_norm": 0.5413045444506639, "learning_rate": 1.8006895009472747e-05, "loss": 0.693, "step": 1640 }, { "epoch": 0.854989604989605, "grad_norm": 0.5634810470157456, "learning_rate": 1.7988740747390182e-05, "loss": 0.6973, "step": 1645 }, { "epoch": 0.8575883575883576, "grad_norm": 0.5742831274552044, "learning_rate": 1.797051341955536e-05, "loss": 0.6983, "step": 1650 }, { "epoch": 0.8601871101871101, "grad_norm": 0.5870076478023661, "learning_rate": 1.7952213192677074e-05, "loss": 0.7139, "step": 1655 }, { "epoch": 0.8627858627858628, "grad_norm": 0.5730171778092863, "learning_rate": 1.7933840234130878e-05, "loss": 0.7048, "step": 1660 }, { "epoch": 0.8653846153846154, "grad_norm": 0.5104051114581488, "learning_rate": 1.7915394711957523e-05, "loss": 0.7073, "step": 1665 }, { "epoch": 0.867983367983368, "grad_norm": 0.5580415382036159, "learning_rate": 1.7896876794861443e-05, "loss": 0.6942, "step": 1670 }, { "epoch": 0.8705821205821206, "grad_norm": 0.5837643043381491, "learning_rate": 1.7878286652209196e-05, "loss": 0.7025, "step": 1675 }, { "epoch": 0.8731808731808732, "grad_norm": 0.5423350178745967, "learning_rate": 1.785962445402792e-05, "loss": 0.6952, "step": 1680 }, { "epoch": 0.8757796257796258, "grad_norm": 0.5729568854084454, "learning_rate": 1.7840890371003795e-05, "loss": 0.6966, "step": 1685 }, { "epoch": 0.8783783783783784, "grad_norm": 0.5553594551886265, "learning_rate": 1.782208457448044e-05, "loss": 0.7013, "step": 1690 }, { "epoch": 0.8809771309771309, "grad_norm": 0.535467096093925, "learning_rate": 1.7803207236457404e-05, "loss": 0.7082, "step": 1695 }, { "epoch": 0.8835758835758836, "grad_norm": 0.5486284072585131, "learning_rate": 1.778425852958853e-05, "loss": 0.6666, "step": 1700 }, { "epoch": 0.8861746361746362, "grad_norm": 0.5078876333931026, "learning_rate": 1.7765238627180424e-05, "loss": 0.6894, "step": 1705 }, { "epoch": 0.8887733887733887, "grad_norm": 0.5667869632736022, "learning_rate": 1.7746147703190857e-05, "loss": 0.704, "step": 1710 }, { "epoch": 0.8913721413721414, "grad_norm": 0.574959887360108, "learning_rate": 1.7726985932227156e-05, "loss": 0.7107, "step": 1715 }, { "epoch": 0.893970893970894, "grad_norm": 0.5224789850325783, "learning_rate": 1.7707753489544628e-05, "loss": 0.7047, "step": 1720 }, { "epoch": 0.8965696465696466, "grad_norm": 0.5527415568002146, "learning_rate": 1.768845055104495e-05, "loss": 0.7091, "step": 1725 }, { "epoch": 0.8991683991683992, "grad_norm": 0.5812394569236012, "learning_rate": 1.7669077293274564e-05, "loss": 0.6862, "step": 1730 }, { "epoch": 0.9017671517671517, "grad_norm": 0.5235725170689791, "learning_rate": 1.764963389342305e-05, "loss": 0.6811, "step": 1735 }, { "epoch": 0.9043659043659044, "grad_norm": 0.5171430012007782, "learning_rate": 1.7630120529321518e-05, "loss": 0.6846, "step": 1740 }, { "epoch": 0.906964656964657, "grad_norm": 0.528693038590328, "learning_rate": 1.7610537379440987e-05, "loss": 0.6915, "step": 1745 }, { "epoch": 0.9095634095634095, "grad_norm": 0.5364540922380395, "learning_rate": 1.759088462289072e-05, "loss": 0.684, "step": 1750 }, { "epoch": 0.9121621621621622, "grad_norm": 0.5660558568326042, "learning_rate": 1.7571162439416632e-05, "loss": 0.6955, "step": 1755 }, { "epoch": 0.9147609147609148, "grad_norm": 0.5672957617937873, "learning_rate": 1.755137100939961e-05, "loss": 0.6988, "step": 1760 }, { "epoch": 0.9173596673596673, "grad_norm": 0.5530873304373302, "learning_rate": 1.753151051385388e-05, "loss": 0.6833, "step": 1765 }, { "epoch": 0.91995841995842, "grad_norm": 0.5178111657664748, "learning_rate": 1.7511581134425347e-05, "loss": 0.7073, "step": 1770 }, { "epoch": 0.9225571725571725, "grad_norm": 0.5721288757020301, "learning_rate": 1.7491583053389937e-05, "loss": 0.6863, "step": 1775 }, { "epoch": 0.9251559251559252, "grad_norm": 0.5726074152322754, "learning_rate": 1.7471516453651925e-05, "loss": 0.6885, "step": 1780 }, { "epoch": 0.9277546777546778, "grad_norm": 0.5569113799035454, "learning_rate": 1.7451381518742264e-05, "loss": 0.6919, "step": 1785 }, { "epoch": 0.9303534303534303, "grad_norm": 0.5349940876609687, "learning_rate": 1.7431178432816905e-05, "loss": 0.6888, "step": 1790 }, { "epoch": 0.932952182952183, "grad_norm": 0.48085486147721074, "learning_rate": 1.7410907380655118e-05, "loss": 0.6892, "step": 1795 }, { "epoch": 0.9355509355509356, "grad_norm": 0.5162470850450532, "learning_rate": 1.7390568547657797e-05, "loss": 0.6844, "step": 1800 }, { "epoch": 0.9381496881496881, "grad_norm": 0.5500432932817269, "learning_rate": 1.7370162119845768e-05, "loss": 0.677, "step": 1805 }, { "epoch": 0.9407484407484408, "grad_norm": 0.5691270831237378, "learning_rate": 1.734968828385808e-05, "loss": 0.6816, "step": 1810 }, { "epoch": 0.9433471933471933, "grad_norm": 0.5353076421264558, "learning_rate": 1.7329147226950303e-05, "loss": 0.6825, "step": 1815 }, { "epoch": 0.9459459459459459, "grad_norm": 0.552477154180168, "learning_rate": 1.7308539136992823e-05, "loss": 0.6893, "step": 1820 }, { "epoch": 0.9485446985446986, "grad_norm": 0.5280777987730796, "learning_rate": 1.7287864202469117e-05, "loss": 0.7004, "step": 1825 }, { "epoch": 0.9511434511434511, "grad_norm": 0.5437828698378319, "learning_rate": 1.7267122612474013e-05, "loss": 0.6761, "step": 1830 }, { "epoch": 0.9537422037422038, "grad_norm": 0.5687279165024458, "learning_rate": 1.7246314556711994e-05, "loss": 0.6894, "step": 1835 }, { "epoch": 0.9563409563409564, "grad_norm": 0.5740312633264971, "learning_rate": 1.7225440225495436e-05, "loss": 0.6914, "step": 1840 }, { "epoch": 0.9589397089397089, "grad_norm": 0.5573795518397149, "learning_rate": 1.720449980974288e-05, "loss": 0.6771, "step": 1845 }, { "epoch": 0.9615384615384616, "grad_norm": 0.5351408449090207, "learning_rate": 1.7183493500977277e-05, "loss": 0.6932, "step": 1850 }, { "epoch": 0.9641372141372141, "grad_norm": 0.5528674527887268, "learning_rate": 1.7162421491324247e-05, "loss": 0.6836, "step": 1855 }, { "epoch": 0.9667359667359667, "grad_norm": 0.5406423387911308, "learning_rate": 1.7141283973510313e-05, "loss": 0.691, "step": 1860 }, { "epoch": 0.9693347193347194, "grad_norm": 0.5420681920741066, "learning_rate": 1.712008114086115e-05, "loss": 0.7039, "step": 1865 }, { "epoch": 0.9719334719334719, "grad_norm": 0.496824504759365, "learning_rate": 1.7098813187299786e-05, "loss": 0.692, "step": 1870 }, { "epoch": 0.9745322245322245, "grad_norm": 0.5324381566943999, "learning_rate": 1.707748030734488e-05, "loss": 0.6776, "step": 1875 }, { "epoch": 0.9771309771309772, "grad_norm": 0.5230259681581492, "learning_rate": 1.7056082696108896e-05, "loss": 0.6847, "step": 1880 }, { "epoch": 0.9797297297297297, "grad_norm": 0.5404603350045258, "learning_rate": 1.7034620549296336e-05, "loss": 0.6896, "step": 1885 }, { "epoch": 0.9823284823284824, "grad_norm": 0.5198497643717813, "learning_rate": 1.701309406320196e-05, "loss": 0.6676, "step": 1890 }, { "epoch": 0.9849272349272349, "grad_norm": 0.49415192104030464, "learning_rate": 1.699150343470897e-05, "loss": 0.6839, "step": 1895 }, { "epoch": 0.9875259875259875, "grad_norm": 0.5044834255400932, "learning_rate": 1.696984886128723e-05, "loss": 0.6913, "step": 1900 }, { "epoch": 0.9901247401247402, "grad_norm": 0.5398617776602235, "learning_rate": 1.6948130540991443e-05, "loss": 0.6874, "step": 1905 }, { "epoch": 0.9927234927234927, "grad_norm": 0.5225589262440207, "learning_rate": 1.6926348672459347e-05, "loss": 0.6822, "step": 1910 }, { "epoch": 0.9953222453222453, "grad_norm": 0.5351932477818484, "learning_rate": 1.6904503454909905e-05, "loss": 0.668, "step": 1915 }, { "epoch": 0.997920997920998, "grad_norm": 0.5596299212706576, "learning_rate": 1.688259508814147e-05, "loss": 0.6884, "step": 1920 }, { "epoch": 1.0, "eval_loss": 0.7763931155204773, "eval_runtime": 106.5617, "eval_samples_per_second": 77.045, "eval_steps_per_second": 1.211, "step": 1924 }, { "epoch": 1.0005197505197505, "grad_norm": 0.7015242091925799, "learning_rate": 1.6860623772529964e-05, "loss": 0.6682, "step": 1925 }, { "epoch": 1.003118503118503, "grad_norm": 0.6034586283492043, "learning_rate": 1.6838589709027043e-05, "loss": 0.6255, "step": 1930 }, { "epoch": 1.0057172557172558, "grad_norm": 0.6673777384785803, "learning_rate": 1.681649309915827e-05, "loss": 0.615, "step": 1935 }, { "epoch": 1.0083160083160083, "grad_norm": 0.55969047689154, "learning_rate": 1.6794334145021252e-05, "loss": 0.6276, "step": 1940 }, { "epoch": 1.0109147609147608, "grad_norm": 0.5655945464921515, "learning_rate": 1.677211304928381e-05, "loss": 0.6072, "step": 1945 }, { "epoch": 1.0135135135135136, "grad_norm": 0.5544352654106766, "learning_rate": 1.6749830015182106e-05, "loss": 0.604, "step": 1950 }, { "epoch": 1.0161122661122661, "grad_norm": 0.5513580613594744, "learning_rate": 1.6727485246518813e-05, "loss": 0.6087, "step": 1955 }, { "epoch": 1.0187110187110187, "grad_norm": 0.5475994898582014, "learning_rate": 1.6705078947661224e-05, "loss": 0.6125, "step": 1960 }, { "epoch": 1.0213097713097714, "grad_norm": 0.5469347962338588, "learning_rate": 1.668261132353939e-05, "loss": 0.6079, "step": 1965 }, { "epoch": 1.023908523908524, "grad_norm": 0.6609443347963427, "learning_rate": 1.6660082579644257e-05, "loss": 0.6085, "step": 1970 }, { "epoch": 1.0265072765072765, "grad_norm": 0.5735256704279655, "learning_rate": 1.6637492922025767e-05, "loss": 0.5988, "step": 1975 }, { "epoch": 1.0291060291060292, "grad_norm": 0.5604570622446723, "learning_rate": 1.6614842557291003e-05, "loss": 0.605, "step": 1980 }, { "epoch": 1.0317047817047817, "grad_norm": 0.5716162653407055, "learning_rate": 1.6592131692602257e-05, "loss": 0.6199, "step": 1985 }, { "epoch": 1.0343035343035343, "grad_norm": 0.572255735683218, "learning_rate": 1.6569360535675177e-05, "loss": 0.6136, "step": 1990 }, { "epoch": 1.0369022869022868, "grad_norm": 0.5548434711803698, "learning_rate": 1.654652929477684e-05, "loss": 0.6292, "step": 1995 }, { "epoch": 1.0395010395010396, "grad_norm": 0.5372326277134161, "learning_rate": 1.6523638178723863e-05, "loss": 0.615, "step": 2000 }, { "epoch": 1.042099792099792, "grad_norm": 0.5629251525957598, "learning_rate": 1.6500687396880483e-05, "loss": 0.5994, "step": 2005 }, { "epoch": 1.0446985446985446, "grad_norm": 0.5386319104306212, "learning_rate": 1.6477677159156647e-05, "loss": 0.6074, "step": 2010 }, { "epoch": 1.0472972972972974, "grad_norm": 0.5344519142234625, "learning_rate": 1.6454607676006085e-05, "loss": 0.6093, "step": 2015 }, { "epoch": 1.04989604989605, "grad_norm": 0.5911538942849666, "learning_rate": 1.64314791584244e-05, "loss": 0.6219, "step": 2020 }, { "epoch": 1.0524948024948024, "grad_norm": 0.5747632841380809, "learning_rate": 1.6408291817947126e-05, "loss": 0.6229, "step": 2025 }, { "epoch": 1.0550935550935552, "grad_norm": 0.5553970996601802, "learning_rate": 1.6385045866647797e-05, "loss": 0.6131, "step": 2030 }, { "epoch": 1.0576923076923077, "grad_norm": 0.54652804470796, "learning_rate": 1.6361741517136e-05, "loss": 0.6189, "step": 2035 }, { "epoch": 1.0602910602910602, "grad_norm": 0.5652320668164962, "learning_rate": 1.633837898255545e-05, "loss": 0.6206, "step": 2040 }, { "epoch": 1.062889812889813, "grad_norm": 0.5377492795503913, "learning_rate": 1.631495847658202e-05, "loss": 0.6246, "step": 2045 }, { "epoch": 1.0654885654885655, "grad_norm": 0.5701190357161201, "learning_rate": 1.6291480213421796e-05, "loss": 0.6151, "step": 2050 }, { "epoch": 1.068087318087318, "grad_norm": 0.5661751524816903, "learning_rate": 1.626794440780911e-05, "loss": 0.6155, "step": 2055 }, { "epoch": 1.0706860706860706, "grad_norm": 0.563344281114797, "learning_rate": 1.62443512750046e-05, "loss": 0.6065, "step": 2060 }, { "epoch": 1.0732848232848233, "grad_norm": 0.5605142234460203, "learning_rate": 1.6220701030793203e-05, "loss": 0.6182, "step": 2065 }, { "epoch": 1.0758835758835759, "grad_norm": 0.5276428593516702, "learning_rate": 1.6196993891482216e-05, "loss": 0.622, "step": 2070 }, { "epoch": 1.0784823284823284, "grad_norm": 0.5855580679475535, "learning_rate": 1.6173230073899303e-05, "loss": 0.613, "step": 2075 }, { "epoch": 1.0810810810810811, "grad_norm": 0.5464688542980072, "learning_rate": 1.6149409795390503e-05, "loss": 0.6109, "step": 2080 }, { "epoch": 1.0836798336798337, "grad_norm": 0.523861090524199, "learning_rate": 1.6125533273818257e-05, "loss": 0.5932, "step": 2085 }, { "epoch": 1.0862785862785862, "grad_norm": 0.5800436769814354, "learning_rate": 1.6101600727559423e-05, "loss": 0.5974, "step": 2090 }, { "epoch": 1.088877338877339, "grad_norm": 0.5314763619677401, "learning_rate": 1.6077612375503244e-05, "loss": 0.6233, "step": 2095 }, { "epoch": 1.0914760914760915, "grad_norm": 0.5292466384443512, "learning_rate": 1.605356843704938e-05, "loss": 0.6082, "step": 2100 }, { "epoch": 1.094074844074844, "grad_norm": 0.5877188139637917, "learning_rate": 1.6029469132105886e-05, "loss": 0.6255, "step": 2105 }, { "epoch": 1.0966735966735968, "grad_norm": 0.5760369419098388, "learning_rate": 1.6005314681087208e-05, "loss": 0.6157, "step": 2110 }, { "epoch": 1.0992723492723493, "grad_norm": 0.630752472432159, "learning_rate": 1.598110530491216e-05, "loss": 0.6175, "step": 2115 }, { "epoch": 1.1018711018711018, "grad_norm": 0.5608953698596442, "learning_rate": 1.595684122500191e-05, "loss": 0.6177, "step": 2120 }, { "epoch": 1.1044698544698546, "grad_norm": 0.573695393031942, "learning_rate": 1.593252266327794e-05, "loss": 0.6243, "step": 2125 }, { "epoch": 1.107068607068607, "grad_norm": 0.5935701794951512, "learning_rate": 1.590814984216004e-05, "loss": 0.6134, "step": 2130 }, { "epoch": 1.1096673596673596, "grad_norm": 0.5895903291761935, "learning_rate": 1.588372298456426e-05, "loss": 0.6082, "step": 2135 }, { "epoch": 1.1122661122661124, "grad_norm": 0.5711362021969438, "learning_rate": 1.5859242313900866e-05, "loss": 0.6048, "step": 2140 }, { "epoch": 1.114864864864865, "grad_norm": 0.5761578602169135, "learning_rate": 1.583470805407231e-05, "loss": 0.619, "step": 2145 }, { "epoch": 1.1174636174636174, "grad_norm": 0.5567866303525553, "learning_rate": 1.581012042947117e-05, "loss": 0.6112, "step": 2150 }, { "epoch": 1.12006237006237, "grad_norm": 0.5493991058746482, "learning_rate": 1.578547966497811e-05, "loss": 0.5976, "step": 2155 }, { "epoch": 1.1226611226611227, "grad_norm": 0.5652320554508646, "learning_rate": 1.57607859859598e-05, "loss": 0.6048, "step": 2160 }, { "epoch": 1.1252598752598753, "grad_norm": 0.5401641304994612, "learning_rate": 1.57360396182669e-05, "loss": 0.6082, "step": 2165 }, { "epoch": 1.1278586278586278, "grad_norm": 0.5606818865719918, "learning_rate": 1.5711240788231933e-05, "loss": 0.6039, "step": 2170 }, { "epoch": 1.1304573804573805, "grad_norm": 0.6007383546804671, "learning_rate": 1.5686389722667273e-05, "loss": 0.6047, "step": 2175 }, { "epoch": 1.133056133056133, "grad_norm": 0.5715756539794042, "learning_rate": 1.5661486648863027e-05, "loss": 0.6252, "step": 2180 }, { "epoch": 1.1356548856548856, "grad_norm": 0.6079845247405427, "learning_rate": 1.563653179458499e-05, "loss": 0.6099, "step": 2185 }, { "epoch": 1.1382536382536383, "grad_norm": 0.5663226785265596, "learning_rate": 1.5611525388072525e-05, "loss": 0.5996, "step": 2190 }, { "epoch": 1.1408523908523909, "grad_norm": 0.6108898947357355, "learning_rate": 1.5586467658036526e-05, "loss": 0.6209, "step": 2195 }, { "epoch": 1.1434511434511434, "grad_norm": 0.6420427561575582, "learning_rate": 1.556135883365727e-05, "loss": 0.6038, "step": 2200 }, { "epoch": 1.1460498960498962, "grad_norm": 0.587335250663389, "learning_rate": 1.5536199144582354e-05, "loss": 0.6242, "step": 2205 }, { "epoch": 1.1486486486486487, "grad_norm": 0.5910496137391441, "learning_rate": 1.5510988820924598e-05, "loss": 0.6069, "step": 2210 }, { "epoch": 1.1512474012474012, "grad_norm": 0.5655552313228328, "learning_rate": 1.5485728093259923e-05, "loss": 0.6225, "step": 2215 }, { "epoch": 1.1538461538461537, "grad_norm": 0.554660591831712, "learning_rate": 1.5460417192625245e-05, "loss": 0.6121, "step": 2220 }, { "epoch": 1.1564449064449065, "grad_norm": 0.5924033501687683, "learning_rate": 1.5435056350516376e-05, "loss": 0.6108, "step": 2225 }, { "epoch": 1.159043659043659, "grad_norm": 0.5678195976691061, "learning_rate": 1.54096457988859e-05, "loss": 0.6146, "step": 2230 }, { "epoch": 1.1616424116424116, "grad_norm": 0.582762073846593, "learning_rate": 1.5384185770141027e-05, "loss": 0.6116, "step": 2235 }, { "epoch": 1.1642411642411643, "grad_norm": 0.5520839459854381, "learning_rate": 1.535867649714152e-05, "loss": 0.6167, "step": 2240 }, { "epoch": 1.1668399168399168, "grad_norm": 0.5394988505298011, "learning_rate": 1.533311821319751e-05, "loss": 0.6173, "step": 2245 }, { "epoch": 1.1694386694386694, "grad_norm": 0.5490162258104867, "learning_rate": 1.5307511152067397e-05, "loss": 0.6195, "step": 2250 }, { "epoch": 1.1720374220374221, "grad_norm": 0.5176946937084966, "learning_rate": 1.5281855547955704e-05, "loss": 0.6063, "step": 2255 }, { "epoch": 1.1746361746361746, "grad_norm": 0.5697232320984311, "learning_rate": 1.5256151635510925e-05, "loss": 0.6132, "step": 2260 }, { "epoch": 1.1772349272349272, "grad_norm": 0.5408355403813135, "learning_rate": 1.5230399649823389e-05, "loss": 0.6202, "step": 2265 }, { "epoch": 1.17983367983368, "grad_norm": 0.5504776040838202, "learning_rate": 1.5204599826423108e-05, "loss": 0.6121, "step": 2270 }, { "epoch": 1.1824324324324325, "grad_norm": 0.5337013368651256, "learning_rate": 1.5178752401277628e-05, "loss": 0.616, "step": 2275 }, { "epoch": 1.185031185031185, "grad_norm": 0.562149132935065, "learning_rate": 1.5152857610789854e-05, "loss": 0.6097, "step": 2280 }, { "epoch": 1.1876299376299375, "grad_norm": 0.5909197735161369, "learning_rate": 1.5126915691795905e-05, "loss": 0.6188, "step": 2285 }, { "epoch": 1.1902286902286903, "grad_norm": 0.5535938243322149, "learning_rate": 1.5100926881562936e-05, "loss": 0.6137, "step": 2290 }, { "epoch": 1.1928274428274428, "grad_norm": 0.544767406909682, "learning_rate": 1.5074891417786993e-05, "loss": 0.6133, "step": 2295 }, { "epoch": 1.1954261954261955, "grad_norm": 0.5459850942463099, "learning_rate": 1.5048809538590789e-05, "loss": 0.613, "step": 2300 }, { "epoch": 1.198024948024948, "grad_norm": 0.5873358493955128, "learning_rate": 1.5022681482521579e-05, "loss": 0.6156, "step": 2305 }, { "epoch": 1.2006237006237006, "grad_norm": 0.5644324461104552, "learning_rate": 1.499650748854895e-05, "loss": 0.6155, "step": 2310 }, { "epoch": 1.2032224532224531, "grad_norm": 0.5531535214490884, "learning_rate": 1.4970287796062642e-05, "loss": 0.6191, "step": 2315 }, { "epoch": 1.2058212058212059, "grad_norm": 0.5509179294326446, "learning_rate": 1.494402264487035e-05, "loss": 0.614, "step": 2320 }, { "epoch": 1.2084199584199584, "grad_norm": 0.5585470168515849, "learning_rate": 1.491771227519555e-05, "loss": 0.6139, "step": 2325 }, { "epoch": 1.211018711018711, "grad_norm": 0.5129593419686834, "learning_rate": 1.4891356927675284e-05, "loss": 0.6089, "step": 2330 }, { "epoch": 1.2136174636174637, "grad_norm": 0.5920443075253277, "learning_rate": 1.4864956843357967e-05, "loss": 0.63, "step": 2335 }, { "epoch": 1.2162162162162162, "grad_norm": 0.5559902991412571, "learning_rate": 1.4838512263701184e-05, "loss": 0.6228, "step": 2340 }, { "epoch": 1.2188149688149688, "grad_norm": 0.5643995055948857, "learning_rate": 1.4812023430569467e-05, "loss": 0.619, "step": 2345 }, { "epoch": 1.2214137214137215, "grad_norm": 0.5742853786867631, "learning_rate": 1.4785490586232108e-05, "loss": 0.6245, "step": 2350 }, { "epoch": 1.224012474012474, "grad_norm": 0.5778953782438334, "learning_rate": 1.4758913973360919e-05, "loss": 0.6227, "step": 2355 }, { "epoch": 1.2266112266112266, "grad_norm": 0.5925914426786582, "learning_rate": 1.4732293835028038e-05, "loss": 0.6107, "step": 2360 }, { "epoch": 1.2292099792099793, "grad_norm": 0.5895371651072315, "learning_rate": 1.4705630414703669e-05, "loss": 0.6057, "step": 2365 }, { "epoch": 1.2318087318087318, "grad_norm": 0.6081772444953167, "learning_rate": 1.4678923956253894e-05, "loss": 0.6424, "step": 2370 }, { "epoch": 1.2344074844074844, "grad_norm": 0.5933961879145944, "learning_rate": 1.4652174703938422e-05, "loss": 0.6128, "step": 2375 }, { "epoch": 1.237006237006237, "grad_norm": 0.6054620771138413, "learning_rate": 1.4625382902408356e-05, "loss": 0.6084, "step": 2380 }, { "epoch": 1.2396049896049897, "grad_norm": 0.5776932281070712, "learning_rate": 1.4598548796703953e-05, "loss": 0.6217, "step": 2385 }, { "epoch": 1.2422037422037422, "grad_norm": 0.5591153237371339, "learning_rate": 1.4571672632252404e-05, "loss": 0.6059, "step": 2390 }, { "epoch": 1.2448024948024947, "grad_norm": 0.5667751253010028, "learning_rate": 1.4544754654865553e-05, "loss": 0.6269, "step": 2395 }, { "epoch": 1.2474012474012475, "grad_norm": 0.5510576618147843, "learning_rate": 1.4517795110737687e-05, "loss": 0.6175, "step": 2400 }, { "epoch": 1.25, "grad_norm": 0.5653685584114336, "learning_rate": 1.4490794246443249e-05, "loss": 0.6141, "step": 2405 }, { "epoch": 1.2525987525987525, "grad_norm": 0.569054506821339, "learning_rate": 1.446375230893462e-05, "loss": 0.6132, "step": 2410 }, { "epoch": 1.255197505197505, "grad_norm": 0.5530850077073164, "learning_rate": 1.4436669545539824e-05, "loss": 0.6112, "step": 2415 }, { "epoch": 1.2577962577962578, "grad_norm": 0.5413151446394687, "learning_rate": 1.4409546203960284e-05, "loss": 0.6032, "step": 2420 }, { "epoch": 1.2603950103950103, "grad_norm": 0.5230951552758679, "learning_rate": 1.4382382532268566e-05, "loss": 0.6144, "step": 2425 }, { "epoch": 1.262993762993763, "grad_norm": 0.541771918919958, "learning_rate": 1.4355178778906085e-05, "loss": 0.6234, "step": 2430 }, { "epoch": 1.2655925155925156, "grad_norm": 0.5203001197628181, "learning_rate": 1.4327935192680857e-05, "loss": 0.6045, "step": 2435 }, { "epoch": 1.2681912681912682, "grad_norm": 0.5440655504089812, "learning_rate": 1.4300652022765207e-05, "loss": 0.6139, "step": 2440 }, { "epoch": 1.2707900207900207, "grad_norm": 0.6149133483770466, "learning_rate": 1.4273329518693497e-05, "loss": 0.6145, "step": 2445 }, { "epoch": 1.2733887733887734, "grad_norm": 0.6021509402407774, "learning_rate": 1.4245967930359848e-05, "loss": 0.6159, "step": 2450 }, { "epoch": 1.275987525987526, "grad_norm": 0.5913158357105107, "learning_rate": 1.4218567508015841e-05, "loss": 0.6168, "step": 2455 }, { "epoch": 1.2785862785862787, "grad_norm": 0.5618432626028342, "learning_rate": 1.4191128502268242e-05, "loss": 0.6152, "step": 2460 }, { "epoch": 1.2811850311850312, "grad_norm": 0.5249984782845095, "learning_rate": 1.4163651164076705e-05, "loss": 0.6086, "step": 2465 }, { "epoch": 1.2837837837837838, "grad_norm": 0.5672830278319703, "learning_rate": 1.4136135744751468e-05, "loss": 0.6114, "step": 2470 }, { "epoch": 1.2863825363825363, "grad_norm": 0.5682891875562709, "learning_rate": 1.4108582495951077e-05, "loss": 0.6148, "step": 2475 }, { "epoch": 1.288981288981289, "grad_norm": 0.5615341097983116, "learning_rate": 1.408099166968005e-05, "loss": 0.6111, "step": 2480 }, { "epoch": 1.2915800415800416, "grad_norm": 0.5497563938968811, "learning_rate": 1.4053363518286613e-05, "loss": 0.6088, "step": 2485 }, { "epoch": 1.2941787941787941, "grad_norm": 0.5582405570031684, "learning_rate": 1.4025698294460362e-05, "loss": 0.6136, "step": 2490 }, { "epoch": 1.2967775467775469, "grad_norm": 0.6011380118880273, "learning_rate": 1.3997996251229948e-05, "loss": 0.6186, "step": 2495 }, { "epoch": 1.2993762993762994, "grad_norm": 0.5496562610843831, "learning_rate": 1.3970257641960795e-05, "loss": 0.6182, "step": 2500 }, { "epoch": 1.301975051975052, "grad_norm": 0.5687796275549053, "learning_rate": 1.3942482720352761e-05, "loss": 0.6157, "step": 2505 }, { "epoch": 1.3045738045738045, "grad_norm": 0.574298920577317, "learning_rate": 1.3914671740437811e-05, "loss": 0.6136, "step": 2510 }, { "epoch": 1.3071725571725572, "grad_norm": 0.5542768495449328, "learning_rate": 1.3886824956577702e-05, "loss": 0.6031, "step": 2515 }, { "epoch": 1.3097713097713097, "grad_norm": 0.5666521327715712, "learning_rate": 1.3858942623461664e-05, "loss": 0.6062, "step": 2520 }, { "epoch": 1.3123700623700625, "grad_norm": 0.5383202751991224, "learning_rate": 1.3831024996104065e-05, "loss": 0.6119, "step": 2525 }, { "epoch": 1.314968814968815, "grad_norm": 0.550924324768737, "learning_rate": 1.3803072329842073e-05, "loss": 0.6218, "step": 2530 }, { "epoch": 1.3175675675675675, "grad_norm": 0.5715325257279636, "learning_rate": 1.3775084880333323e-05, "loss": 0.6197, "step": 2535 }, { "epoch": 1.32016632016632, "grad_norm": 0.5516314324953223, "learning_rate": 1.3747062903553582e-05, "loss": 0.5983, "step": 2540 }, { "epoch": 1.3227650727650728, "grad_norm": 0.5587681122677882, "learning_rate": 1.3719006655794414e-05, "loss": 0.6104, "step": 2545 }, { "epoch": 1.3253638253638254, "grad_norm": 0.5529619265877077, "learning_rate": 1.3690916393660815e-05, "loss": 0.6232, "step": 2550 }, { "epoch": 1.3279625779625779, "grad_norm": 0.6007892832321496, "learning_rate": 1.3662792374068896e-05, "loss": 0.6246, "step": 2555 }, { "epoch": 1.3305613305613306, "grad_norm": 0.5102078314524738, "learning_rate": 1.3634634854243503e-05, "loss": 0.6037, "step": 2560 }, { "epoch": 1.3331600831600832, "grad_norm": 0.5068981925325898, "learning_rate": 1.3606444091715883e-05, "loss": 0.6056, "step": 2565 }, { "epoch": 1.3357588357588357, "grad_norm": 0.5201200155890484, "learning_rate": 1.3578220344321325e-05, "loss": 0.6088, "step": 2570 }, { "epoch": 1.3383575883575882, "grad_norm": 0.5411417638449072, "learning_rate": 1.3549963870196796e-05, "loss": 0.606, "step": 2575 }, { "epoch": 1.340956340956341, "grad_norm": 0.5169808096315553, "learning_rate": 1.3521674927778594e-05, "loss": 0.6278, "step": 2580 }, { "epoch": 1.3435550935550935, "grad_norm": 0.5658934679962141, "learning_rate": 1.3493353775799967e-05, "loss": 0.6067, "step": 2585 }, { "epoch": 1.3461538461538463, "grad_norm": 0.5724238241800808, "learning_rate": 1.3465000673288757e-05, "loss": 0.6003, "step": 2590 }, { "epoch": 1.3487525987525988, "grad_norm": 0.6105368545978801, "learning_rate": 1.3436615879565025e-05, "loss": 0.616, "step": 2595 }, { "epoch": 1.3513513513513513, "grad_norm": 0.5188576936304327, "learning_rate": 1.340819965423869e-05, "loss": 0.6283, "step": 2600 }, { "epoch": 1.3539501039501038, "grad_norm": 0.4959836182939828, "learning_rate": 1.3379752257207144e-05, "loss": 0.6157, "step": 2605 }, { "epoch": 1.3565488565488566, "grad_norm": 0.5769448388897034, "learning_rate": 1.3351273948652872e-05, "loss": 0.6133, "step": 2610 }, { "epoch": 1.3591476091476091, "grad_norm": 0.5647777721810548, "learning_rate": 1.3322764989041086e-05, "loss": 0.6047, "step": 2615 }, { "epoch": 1.3617463617463619, "grad_norm": 0.5362269489941972, "learning_rate": 1.329422563911734e-05, "loss": 0.6244, "step": 2620 }, { "epoch": 1.3643451143451144, "grad_norm": 0.5876277649004987, "learning_rate": 1.326565615990513e-05, "loss": 0.6094, "step": 2625 }, { "epoch": 1.366943866943867, "grad_norm": 0.5771702605216373, "learning_rate": 1.3237056812703517e-05, "loss": 0.6162, "step": 2630 }, { "epoch": 1.3695426195426195, "grad_norm": 0.5206111176210121, "learning_rate": 1.3208427859084743e-05, "loss": 0.5991, "step": 2635 }, { "epoch": 1.3721413721413722, "grad_norm": 0.5703420517094763, "learning_rate": 1.3179769560891837e-05, "loss": 0.6158, "step": 2640 }, { "epoch": 1.3747401247401247, "grad_norm": 0.5075630462180919, "learning_rate": 1.315108218023621e-05, "loss": 0.6157, "step": 2645 }, { "epoch": 1.3773388773388773, "grad_norm": 0.5278204198500884, "learning_rate": 1.3122365979495259e-05, "loss": 0.611, "step": 2650 }, { "epoch": 1.37993762993763, "grad_norm": 0.5830494022632724, "learning_rate": 1.3093621221309982e-05, "loss": 0.6226, "step": 2655 }, { "epoch": 1.3825363825363826, "grad_norm": 0.5567019594449695, "learning_rate": 1.3064848168582562e-05, "loss": 0.6128, "step": 2660 }, { "epoch": 1.385135135135135, "grad_norm": 0.5218600131647313, "learning_rate": 1.3036047084473964e-05, "loss": 0.6164, "step": 2665 }, { "epoch": 1.3877338877338876, "grad_norm": 0.5550941890937359, "learning_rate": 1.3007218232401535e-05, "loss": 0.6178, "step": 2670 }, { "epoch": 1.3903326403326404, "grad_norm": 0.5140778619937807, "learning_rate": 1.2978361876036586e-05, "loss": 0.6015, "step": 2675 }, { "epoch": 1.392931392931393, "grad_norm": 0.5704426484745836, "learning_rate": 1.2949478279301993e-05, "loss": 0.6218, "step": 2680 }, { "epoch": 1.3955301455301456, "grad_norm": 0.573333768381573, "learning_rate": 1.292056770636976e-05, "loss": 0.6195, "step": 2685 }, { "epoch": 1.3981288981288982, "grad_norm": 0.5463535484803559, "learning_rate": 1.2891630421658631e-05, "loss": 0.619, "step": 2690 }, { "epoch": 1.4007276507276507, "grad_norm": 0.5239768140578435, "learning_rate": 1.2862666689831655e-05, "loss": 0.5988, "step": 2695 }, { "epoch": 1.4033264033264032, "grad_norm": 0.5254212957357791, "learning_rate": 1.2833676775793766e-05, "loss": 0.6089, "step": 2700 }, { "epoch": 1.405925155925156, "grad_norm": 0.4999540179579075, "learning_rate": 1.2804660944689368e-05, "loss": 0.6161, "step": 2705 }, { "epoch": 1.4085239085239085, "grad_norm": 0.5566115132096349, "learning_rate": 1.2775619461899896e-05, "loss": 0.6182, "step": 2710 }, { "epoch": 1.411122661122661, "grad_norm": 0.5740812598543206, "learning_rate": 1.2746552593041405e-05, "loss": 0.598, "step": 2715 }, { "epoch": 1.4137214137214138, "grad_norm": 0.5437551314682787, "learning_rate": 1.2717460603962132e-05, "loss": 0.609, "step": 2720 }, { "epoch": 1.4163201663201663, "grad_norm": 0.527003171395807, "learning_rate": 1.268834376074007e-05, "loss": 0.6097, "step": 2725 }, { "epoch": 1.4189189189189189, "grad_norm": 0.5151895053958203, "learning_rate": 1.2659202329680515e-05, "loss": 0.6223, "step": 2730 }, { "epoch": 1.4215176715176714, "grad_norm": 0.5220435941255479, "learning_rate": 1.2630036577313667e-05, "loss": 0.6273, "step": 2735 }, { "epoch": 1.4241164241164241, "grad_norm": 0.5541408035311566, "learning_rate": 1.2600846770392155e-05, "loss": 0.6115, "step": 2740 }, { "epoch": 1.4267151767151767, "grad_norm": 0.5183266262374772, "learning_rate": 1.2571633175888618e-05, "loss": 0.6098, "step": 2745 }, { "epoch": 1.4293139293139294, "grad_norm": 0.5467945168613629, "learning_rate": 1.2542396060993256e-05, "loss": 0.6129, "step": 2750 }, { "epoch": 1.431912681912682, "grad_norm": 0.5408402850999704, "learning_rate": 1.2513135693111399e-05, "loss": 0.6113, "step": 2755 }, { "epoch": 1.4345114345114345, "grad_norm": 0.5481669387572653, "learning_rate": 1.2483852339861033e-05, "loss": 0.6032, "step": 2760 }, { "epoch": 1.437110187110187, "grad_norm": 0.5292679934908046, "learning_rate": 1.2454546269070392e-05, "loss": 0.6037, "step": 2765 }, { "epoch": 1.4397089397089398, "grad_norm": 0.5744822983902161, "learning_rate": 1.2425217748775464e-05, "loss": 0.6099, "step": 2770 }, { "epoch": 1.4423076923076923, "grad_norm": 0.5030366381929183, "learning_rate": 1.239586704721758e-05, "loss": 0.6067, "step": 2775 }, { "epoch": 1.444906444906445, "grad_norm": 0.5833985268491657, "learning_rate": 1.2366494432840937e-05, "loss": 0.6039, "step": 2780 }, { "epoch": 1.4475051975051976, "grad_norm": 0.5747742162047574, "learning_rate": 1.2337100174290142e-05, "loss": 0.6101, "step": 2785 }, { "epoch": 1.45010395010395, "grad_norm": 0.5356407427398536, "learning_rate": 1.2307684540407775e-05, "loss": 0.6055, "step": 2790 }, { "epoch": 1.4527027027027026, "grad_norm": 0.5413902409510034, "learning_rate": 1.2278247800231901e-05, "loss": 0.6162, "step": 2795 }, { "epoch": 1.4553014553014554, "grad_norm": 0.5361345781691861, "learning_rate": 1.2248790222993639e-05, "loss": 0.6132, "step": 2800 }, { "epoch": 1.457900207900208, "grad_norm": 0.48977234406410547, "learning_rate": 1.221931207811468e-05, "loss": 0.619, "step": 2805 }, { "epoch": 1.4604989604989604, "grad_norm": 0.5539199421254352, "learning_rate": 1.2189813635204825e-05, "loss": 0.6034, "step": 2810 }, { "epoch": 1.4630977130977132, "grad_norm": 0.5274980068953669, "learning_rate": 1.2160295164059529e-05, "loss": 0.6076, "step": 2815 }, { "epoch": 1.4656964656964657, "grad_norm": 0.5081900105077334, "learning_rate": 1.2130756934657424e-05, "loss": 0.6097, "step": 2820 }, { "epoch": 1.4682952182952183, "grad_norm": 0.5619754096937638, "learning_rate": 1.210119921715785e-05, "loss": 0.6156, "step": 2825 }, { "epoch": 1.4708939708939708, "grad_norm": 0.5058475060346515, "learning_rate": 1.2071622281898394e-05, "loss": 0.6119, "step": 2830 }, { "epoch": 1.4734927234927235, "grad_norm": 0.528937107568451, "learning_rate": 1.2042026399392403e-05, "loss": 0.6034, "step": 2835 }, { "epoch": 1.476091476091476, "grad_norm": 0.5585222059699902, "learning_rate": 1.2012411840326524e-05, "loss": 0.6122, "step": 2840 }, { "epoch": 1.4786902286902288, "grad_norm": 0.5474471042332577, "learning_rate": 1.1982778875558215e-05, "loss": 0.5978, "step": 2845 }, { "epoch": 1.4812889812889813, "grad_norm": 0.5637920526811849, "learning_rate": 1.1953127776113279e-05, "loss": 0.6097, "step": 2850 }, { "epoch": 1.4838877338877339, "grad_norm": 0.5153160827226365, "learning_rate": 1.192345881318338e-05, "loss": 0.6065, "step": 2855 }, { "epoch": 1.4864864864864864, "grad_norm": 0.5089185825931368, "learning_rate": 1.1893772258123554e-05, "loss": 0.5955, "step": 2860 }, { "epoch": 1.4890852390852392, "grad_norm": 0.5284121779832783, "learning_rate": 1.1864068382449756e-05, "loss": 0.6088, "step": 2865 }, { "epoch": 1.4916839916839917, "grad_norm": 0.5231059878227796, "learning_rate": 1.1834347457836337e-05, "loss": 0.5976, "step": 2870 }, { "epoch": 1.4942827442827442, "grad_norm": 0.5517740731632155, "learning_rate": 1.180460975611359e-05, "loss": 0.613, "step": 2875 }, { "epoch": 1.496881496881497, "grad_norm": 0.46831838517285146, "learning_rate": 1.1774855549265245e-05, "loss": 0.6053, "step": 2880 }, { "epoch": 1.4994802494802495, "grad_norm": 0.527557394883835, "learning_rate": 1.1745085109426002e-05, "loss": 0.6174, "step": 2885 }, { "epoch": 1.502079002079002, "grad_norm": 0.5200048942038921, "learning_rate": 1.171529870887902e-05, "loss": 0.6066, "step": 2890 }, { "epoch": 1.5046777546777546, "grad_norm": 0.5460408265611407, "learning_rate": 1.1685496620053434e-05, "loss": 0.6122, "step": 2895 }, { "epoch": 1.5072765072765073, "grad_norm": 0.5171487101859985, "learning_rate": 1.165567911552187e-05, "loss": 0.607, "step": 2900 }, { "epoch": 1.5098752598752598, "grad_norm": 0.5082429135678129, "learning_rate": 1.1625846467997952e-05, "loss": 0.6118, "step": 2905 }, { "epoch": 1.5124740124740126, "grad_norm": 0.536744119246903, "learning_rate": 1.1595998950333794e-05, "loss": 0.6228, "step": 2910 }, { "epoch": 1.5150727650727651, "grad_norm": 0.5540864582315153, "learning_rate": 1.1566136835517518e-05, "loss": 0.6085, "step": 2915 }, { "epoch": 1.5176715176715176, "grad_norm": 0.5480519199954694, "learning_rate": 1.1536260396670753e-05, "loss": 0.6038, "step": 2920 }, { "epoch": 1.5202702702702702, "grad_norm": 0.5320678068411181, "learning_rate": 1.1506369907046135e-05, "loss": 0.6027, "step": 2925 }, { "epoch": 1.5228690228690227, "grad_norm": 0.5559206845902772, "learning_rate": 1.1476465640024814e-05, "loss": 0.6082, "step": 2930 }, { "epoch": 1.5254677754677755, "grad_norm": 0.5919814949422626, "learning_rate": 1.1446547869113944e-05, "loss": 0.5897, "step": 2935 }, { "epoch": 1.5280665280665282, "grad_norm": 0.5327268055659626, "learning_rate": 1.1416616867944192e-05, "loss": 0.611, "step": 2940 }, { "epoch": 1.5306652806652807, "grad_norm": 0.4971186426325191, "learning_rate": 1.1386672910267225e-05, "loss": 0.6101, "step": 2945 }, { "epoch": 1.5332640332640333, "grad_norm": 0.5640128227568957, "learning_rate": 1.1356716269953213e-05, "loss": 0.6199, "step": 2950 }, { "epoch": 1.5358627858627858, "grad_norm": 0.5179662541283063, "learning_rate": 1.1326747220988327e-05, "loss": 0.6202, "step": 2955 }, { "epoch": 1.5384615384615383, "grad_norm": 0.6423145905392057, "learning_rate": 1.1296766037472223e-05, "loss": 0.6144, "step": 2960 }, { "epoch": 1.541060291060291, "grad_norm": 0.5256505864598588, "learning_rate": 1.1266772993615543e-05, "loss": 0.6066, "step": 2965 }, { "epoch": 1.5436590436590436, "grad_norm": 0.5209882272221003, "learning_rate": 1.1236768363737408e-05, "loss": 0.613, "step": 2970 }, { "epoch": 1.5462577962577964, "grad_norm": 0.5139682181751073, "learning_rate": 1.120675242226289e-05, "loss": 0.6195, "step": 2975 }, { "epoch": 1.5488565488565489, "grad_norm": 0.5285679185697464, "learning_rate": 1.1176725443720545e-05, "loss": 0.6074, "step": 2980 }, { "epoch": 1.5514553014553014, "grad_norm": 0.5176763822468469, "learning_rate": 1.1146687702739855e-05, "loss": 0.6225, "step": 2985 }, { "epoch": 1.554054054054054, "grad_norm": 0.5346252383786081, "learning_rate": 1.1116639474048741e-05, "loss": 0.5955, "step": 2990 }, { "epoch": 1.5566528066528067, "grad_norm": 0.5246377509399082, "learning_rate": 1.108658103247104e-05, "loss": 0.6075, "step": 2995 }, { "epoch": 1.5592515592515592, "grad_norm": 0.5852349160305579, "learning_rate": 1.1056512652924014e-05, "loss": 0.6102, "step": 3000 }, { "epoch": 1.561850311850312, "grad_norm": 0.5540954218703817, "learning_rate": 1.1026434610415804e-05, "loss": 0.6073, "step": 3005 }, { "epoch": 1.5644490644490645, "grad_norm": 0.516164831755444, "learning_rate": 1.099634718004293e-05, "loss": 0.6144, "step": 3010 }, { "epoch": 1.567047817047817, "grad_norm": 0.5238437043105261, "learning_rate": 1.0966250636987776e-05, "loss": 0.61, "step": 3015 }, { "epoch": 1.5696465696465696, "grad_norm": 0.5499703346154395, "learning_rate": 1.093614525651608e-05, "loss": 0.6, "step": 3020 }, { "epoch": 1.572245322245322, "grad_norm": 0.5392038397492541, "learning_rate": 1.0906031313974392e-05, "loss": 0.6004, "step": 3025 }, { "epoch": 1.5748440748440748, "grad_norm": 0.5440366683585401, "learning_rate": 1.0875909084787586e-05, "loss": 0.6079, "step": 3030 }, { "epoch": 1.5774428274428276, "grad_norm": 0.5280604613144251, "learning_rate": 1.0845778844456319e-05, "loss": 0.6028, "step": 3035 }, { "epoch": 1.5800415800415801, "grad_norm": 0.5130988979787711, "learning_rate": 1.0815640868554518e-05, "loss": 0.6255, "step": 3040 }, { "epoch": 1.5826403326403327, "grad_norm": 0.5347614455862642, "learning_rate": 1.0785495432726864e-05, "loss": 0.6144, "step": 3045 }, { "epoch": 1.5852390852390852, "grad_norm": 0.5540466808635207, "learning_rate": 1.0755342812686264e-05, "loss": 0.618, "step": 3050 }, { "epoch": 1.5878378378378377, "grad_norm": 0.5158267468916651, "learning_rate": 1.0725183284211335e-05, "loss": 0.6054, "step": 3055 }, { "epoch": 1.5904365904365905, "grad_norm": 0.5235550308126831, "learning_rate": 1.0695017123143881e-05, "loss": 0.6113, "step": 3060 }, { "epoch": 1.593035343035343, "grad_norm": 0.49676274074318394, "learning_rate": 1.0664844605386357e-05, "loss": 0.6066, "step": 3065 }, { "epoch": 1.5956340956340958, "grad_norm": 0.5091724259037824, "learning_rate": 1.0634666006899375e-05, "loss": 0.6059, "step": 3070 }, { "epoch": 1.5982328482328483, "grad_norm": 0.5308888501073562, "learning_rate": 1.0604481603699146e-05, "loss": 0.6077, "step": 3075 }, { "epoch": 1.6008316008316008, "grad_norm": 0.5668118121411413, "learning_rate": 1.0574291671854979e-05, "loss": 0.6119, "step": 3080 }, { "epoch": 1.6034303534303533, "grad_norm": 0.5232440524467463, "learning_rate": 1.054409648748675e-05, "loss": 0.6132, "step": 3085 }, { "epoch": 1.6060291060291059, "grad_norm": 0.5326956732038823, "learning_rate": 1.0513896326762363e-05, "loss": 0.5957, "step": 3090 }, { "epoch": 1.6086278586278586, "grad_norm": 0.5376136523378364, "learning_rate": 1.0483691465895256e-05, "loss": 0.5963, "step": 3095 }, { "epoch": 1.6112266112266114, "grad_norm": 0.5590406644575509, "learning_rate": 1.0453482181141838e-05, "loss": 0.6114, "step": 3100 }, { "epoch": 1.613825363825364, "grad_norm": 0.5348933441437478, "learning_rate": 1.0423268748798992e-05, "loss": 0.626, "step": 3105 }, { "epoch": 1.6164241164241164, "grad_norm": 0.5587808171684693, "learning_rate": 1.0393051445201518e-05, "loss": 0.6035, "step": 3110 }, { "epoch": 1.619022869022869, "grad_norm": 0.5217308721418593, "learning_rate": 1.0362830546719644e-05, "loss": 0.6007, "step": 3115 }, { "epoch": 1.6216216216216215, "grad_norm": 0.5331440823163403, "learning_rate": 1.0332606329756463e-05, "loss": 0.6103, "step": 3120 }, { "epoch": 1.6242203742203742, "grad_norm": 0.5354516402513061, "learning_rate": 1.030237907074542e-05, "loss": 0.6021, "step": 3125 }, { "epoch": 1.6268191268191268, "grad_norm": 0.5301206175827867, "learning_rate": 1.0272149046147788e-05, "loss": 0.6032, "step": 3130 }, { "epoch": 1.6294178794178795, "grad_norm": 0.5364702146724981, "learning_rate": 1.0241916532450133e-05, "loss": 0.6107, "step": 3135 }, { "epoch": 1.632016632016632, "grad_norm": 0.5030704592075379, "learning_rate": 1.0211681806161787e-05, "loss": 0.5984, "step": 3140 }, { "epoch": 1.6346153846153846, "grad_norm": 0.5001028568491547, "learning_rate": 1.0181445143812312e-05, "loss": 0.6011, "step": 3145 }, { "epoch": 1.637214137214137, "grad_norm": 0.5537298706648461, "learning_rate": 1.0151206821948985e-05, "loss": 0.6348, "step": 3150 }, { "epoch": 1.6398128898128899, "grad_norm": 0.5499538795880998, "learning_rate": 1.0120967117134262e-05, "loss": 0.6163, "step": 3155 }, { "epoch": 1.6424116424116424, "grad_norm": 0.4944029513235786, "learning_rate": 1.009072630594324e-05, "loss": 0.5997, "step": 3160 }, { "epoch": 1.6450103950103951, "grad_norm": 0.5560023248781629, "learning_rate": 1.0060484664961136e-05, "loss": 0.6066, "step": 3165 }, { "epoch": 1.6476091476091477, "grad_norm": 0.5228794932020453, "learning_rate": 1.0030242470780769e-05, "loss": 0.6049, "step": 3170 }, { "epoch": 1.6502079002079002, "grad_norm": 0.5251096124443742, "learning_rate": 1e-05, "loss": 0.617, "step": 3175 }, { "epoch": 1.6528066528066527, "grad_norm": 0.504719489023802, "learning_rate": 9.969757529219236e-06, "loss": 0.611, "step": 3180 }, { "epoch": 1.6554054054054053, "grad_norm": 0.5164130013232197, "learning_rate": 9.939515335038866e-06, "loss": 0.6071, "step": 3185 }, { "epoch": 1.658004158004158, "grad_norm": 0.503984804974549, "learning_rate": 9.909273694056765e-06, "loss": 0.6098, "step": 3190 }, { "epoch": 1.6606029106029108, "grad_norm": 0.5318145254626715, "learning_rate": 9.879032882865745e-06, "loss": 0.6046, "step": 3195 }, { "epoch": 1.6632016632016633, "grad_norm": 0.49979486457828537, "learning_rate": 9.848793178051017e-06, "loss": 0.5942, "step": 3200 }, { "epoch": 1.6658004158004158, "grad_norm": 0.5222561724594693, "learning_rate": 9.818554856187692e-06, "loss": 0.6102, "step": 3205 }, { "epoch": 1.6683991683991684, "grad_norm": 0.5119064608955575, "learning_rate": 9.788318193838218e-06, "loss": 0.6063, "step": 3210 }, { "epoch": 1.6709979209979209, "grad_norm": 0.49188265798150393, "learning_rate": 9.758083467549868e-06, "loss": 0.6007, "step": 3215 }, { "epoch": 1.6735966735966736, "grad_norm": 0.5307992559310489, "learning_rate": 9.727850953852217e-06, "loss": 0.6037, "step": 3220 }, { "epoch": 1.6761954261954262, "grad_norm": 0.5456235977768752, "learning_rate": 9.697620929254584e-06, "loss": 0.6244, "step": 3225 }, { "epoch": 1.678794178794179, "grad_norm": 0.5088649958340964, "learning_rate": 9.66739367024354e-06, "loss": 0.6042, "step": 3230 }, { "epoch": 1.6813929313929314, "grad_norm": 0.4953639561715028, "learning_rate": 9.63716945328036e-06, "loss": 0.5938, "step": 3235 }, { "epoch": 1.683991683991684, "grad_norm": 0.49505908823955036, "learning_rate": 9.606948554798482e-06, "loss": 0.6144, "step": 3240 }, { "epoch": 1.6865904365904365, "grad_norm": 0.5175987592879167, "learning_rate": 9.57673125120101e-06, "loss": 0.6098, "step": 3245 }, { "epoch": 1.689189189189189, "grad_norm": 0.5388656862756696, "learning_rate": 9.546517818858164e-06, "loss": 0.6171, "step": 3250 }, { "epoch": 1.6917879417879418, "grad_norm": 0.5153249162580613, "learning_rate": 9.516308534104744e-06, "loss": 0.5923, "step": 3255 }, { "epoch": 1.6943866943866945, "grad_norm": 0.5185938318204056, "learning_rate": 9.486103673237638e-06, "loss": 0.589, "step": 3260 }, { "epoch": 1.696985446985447, "grad_norm": 0.5735162818769731, "learning_rate": 9.455903512513257e-06, "loss": 0.6199, "step": 3265 }, { "epoch": 1.6995841995841996, "grad_norm": 0.5550865974556703, "learning_rate": 9.425708328145023e-06, "loss": 0.603, "step": 3270 }, { "epoch": 1.7021829521829521, "grad_norm": 0.5656039764802955, "learning_rate": 9.395518396300857e-06, "loss": 0.6036, "step": 3275 }, { "epoch": 1.7047817047817047, "grad_norm": 0.5356649490240522, "learning_rate": 9.365333993100628e-06, "loss": 0.5951, "step": 3280 }, { "epoch": 1.7073804573804574, "grad_norm": 0.5715455882322491, "learning_rate": 9.335155394613641e-06, "loss": 0.5989, "step": 3285 }, { "epoch": 1.70997920997921, "grad_norm": 0.5497264191896297, "learning_rate": 9.304982876856124e-06, "loss": 0.6058, "step": 3290 }, { "epoch": 1.7125779625779627, "grad_norm": 0.5051026018528313, "learning_rate": 9.274816715788668e-06, "loss": 0.5969, "step": 3295 }, { "epoch": 1.7151767151767152, "grad_norm": 0.5204788821196659, "learning_rate": 9.244657187313739e-06, "loss": 0.611, "step": 3300 }, { "epoch": 1.7177754677754677, "grad_norm": 0.48242559624890763, "learning_rate": 9.214504567273139e-06, "loss": 0.5893, "step": 3305 }, { "epoch": 1.7203742203742203, "grad_norm": 0.5475293749782204, "learning_rate": 9.184359131445487e-06, "loss": 0.6128, "step": 3310 }, { "epoch": 1.722972972972973, "grad_norm": 0.5227512974575209, "learning_rate": 9.154221155543684e-06, "loss": 0.5942, "step": 3315 }, { "epoch": 1.7255717255717256, "grad_norm": 0.517359580415827, "learning_rate": 9.124090915212415e-06, "loss": 0.5995, "step": 3320 }, { "epoch": 1.7281704781704783, "grad_norm": 0.5305121769843365, "learning_rate": 9.093968686025612e-06, "loss": 0.618, "step": 3325 }, { "epoch": 1.7307692307692308, "grad_norm": 0.5312713830882955, "learning_rate": 9.063854743483924e-06, "loss": 0.5929, "step": 3330 }, { "epoch": 1.7333679833679834, "grad_norm": 0.5159635712284465, "learning_rate": 9.033749363012228e-06, "loss": 0.5942, "step": 3335 }, { "epoch": 1.735966735966736, "grad_norm": 0.5304955336277648, "learning_rate": 9.003652819957073e-06, "loss": 0.5955, "step": 3340 }, { "epoch": 1.7385654885654884, "grad_norm": 0.5306004926529849, "learning_rate": 8.973565389584199e-06, "loss": 0.6157, "step": 3345 }, { "epoch": 1.7411642411642412, "grad_norm": 0.5287399261745209, "learning_rate": 8.943487347075988e-06, "loss": 0.5867, "step": 3350 }, { "epoch": 1.743762993762994, "grad_norm": 0.5230774773864855, "learning_rate": 8.91341896752896e-06, "loss": 0.5894, "step": 3355 }, { "epoch": 1.7463617463617465, "grad_norm": 0.5155471622168707, "learning_rate": 8.883360525951264e-06, "loss": 0.5958, "step": 3360 }, { "epoch": 1.748960498960499, "grad_norm": 0.5133214239778116, "learning_rate": 8.85331229726015e-06, "loss": 0.5935, "step": 3365 }, { "epoch": 1.7515592515592515, "grad_norm": 0.5319975207166266, "learning_rate": 8.823274556279455e-06, "loss": 0.5934, "step": 3370 }, { "epoch": 1.754158004158004, "grad_norm": 0.5364931909740585, "learning_rate": 8.793247577737112e-06, "loss": 0.6055, "step": 3375 }, { "epoch": 1.7567567567567568, "grad_norm": 0.5093682789742844, "learning_rate": 8.763231636262599e-06, "loss": 0.5904, "step": 3380 }, { "epoch": 1.7593555093555093, "grad_norm": 0.49218365344373355, "learning_rate": 8.733227006384459e-06, "loss": 0.6045, "step": 3385 }, { "epoch": 1.761954261954262, "grad_norm": 0.5463702062588134, "learning_rate": 8.703233962527779e-06, "loss": 0.6039, "step": 3390 }, { "epoch": 1.7645530145530146, "grad_norm": 0.5102092525737645, "learning_rate": 8.673252779011676e-06, "loss": 0.5887, "step": 3395 }, { "epoch": 1.7671517671517671, "grad_norm": 0.5268210778389424, "learning_rate": 8.643283730046788e-06, "loss": 0.5983, "step": 3400 }, { "epoch": 1.7697505197505197, "grad_norm": 0.5098708018226924, "learning_rate": 8.61332708973278e-06, "loss": 0.6043, "step": 3405 }, { "epoch": 1.7723492723492722, "grad_norm": 0.48835524185569673, "learning_rate": 8.583383132055814e-06, "loss": 0.6107, "step": 3410 }, { "epoch": 1.774948024948025, "grad_norm": 0.5701236303096751, "learning_rate": 8.55345213088606e-06, "loss": 0.6033, "step": 3415 }, { "epoch": 1.7775467775467777, "grad_norm": 0.5137867247566509, "learning_rate": 8.52353435997519e-06, "loss": 0.5988, "step": 3420 }, { "epoch": 1.7801455301455302, "grad_norm": 0.5185967787599991, "learning_rate": 8.49363009295387e-06, "loss": 0.6027, "step": 3425 }, { "epoch": 1.7827442827442828, "grad_norm": 0.5232087879326293, "learning_rate": 8.46373960332925e-06, "loss": 0.5958, "step": 3430 }, { "epoch": 1.7853430353430353, "grad_norm": 0.5227750785275999, "learning_rate": 8.433863164482485e-06, "loss": 0.6087, "step": 3435 }, { "epoch": 1.7879417879417878, "grad_norm": 0.4796440456103048, "learning_rate": 8.404001049666211e-06, "loss": 0.5961, "step": 3440 }, { "epoch": 1.7905405405405406, "grad_norm": 0.5114161067261779, "learning_rate": 8.37415353200205e-06, "loss": 0.5975, "step": 3445 }, { "epoch": 1.793139293139293, "grad_norm": 0.5368539216036579, "learning_rate": 8.344320884478133e-06, "loss": 0.5995, "step": 3450 }, { "epoch": 1.7957380457380459, "grad_norm": 0.5251230847938383, "learning_rate": 8.314503379946569e-06, "loss": 0.5924, "step": 3455 }, { "epoch": 1.7983367983367984, "grad_norm": 0.5125606084891738, "learning_rate": 8.284701291120984e-06, "loss": 0.59, "step": 3460 }, { "epoch": 1.800935550935551, "grad_norm": 0.5082724750112706, "learning_rate": 8.254914890574001e-06, "loss": 0.5783, "step": 3465 }, { "epoch": 1.8035343035343034, "grad_norm": 0.5857171673424286, "learning_rate": 8.225144450734755e-06, "loss": 0.6159, "step": 3470 }, { "epoch": 1.806133056133056, "grad_norm": 0.5189085809502059, "learning_rate": 8.195390243886414e-06, "loss": 0.5876, "step": 3475 }, { "epoch": 1.8087318087318087, "grad_norm": 0.5054176942242024, "learning_rate": 8.165652542163668e-06, "loss": 0.6018, "step": 3480 }, { "epoch": 1.8113305613305615, "grad_norm": 0.5245871555142563, "learning_rate": 8.135931617550245e-06, "loss": 0.607, "step": 3485 }, { "epoch": 1.813929313929314, "grad_norm": 0.5240802764153503, "learning_rate": 8.106227741876447e-06, "loss": 0.6074, "step": 3490 }, { "epoch": 1.8165280665280665, "grad_norm": 0.5431345881991243, "learning_rate": 8.076541186816625e-06, "loss": 0.6002, "step": 3495 }, { "epoch": 1.819126819126819, "grad_norm": 0.5192080223913004, "learning_rate": 8.046872223886723e-06, "loss": 0.6039, "step": 3500 }, { "epoch": 1.8217255717255716, "grad_norm": 0.5377132118040553, "learning_rate": 8.017221124441787e-06, "loss": 0.5866, "step": 3505 }, { "epoch": 1.8243243243243243, "grad_norm": 0.4848076231447858, "learning_rate": 7.98758815967348e-06, "loss": 0.5926, "step": 3510 }, { "epoch": 1.8269230769230769, "grad_norm": 0.49613490454069115, "learning_rate": 7.957973600607597e-06, "loss": 0.6029, "step": 3515 }, { "epoch": 1.8295218295218296, "grad_norm": 0.49532299518482037, "learning_rate": 7.92837771810161e-06, "loss": 0.5893, "step": 3520 }, { "epoch": 1.8321205821205822, "grad_norm": 0.5240393625504302, "learning_rate": 7.898800782842153e-06, "loss": 0.6044, "step": 3525 }, { "epoch": 1.8347193347193347, "grad_norm": 0.5378508353167911, "learning_rate": 7.86924306534258e-06, "loss": 0.5892, "step": 3530 }, { "epoch": 1.8373180873180872, "grad_norm": 0.5199976481026775, "learning_rate": 7.839704835940473e-06, "loss": 0.5982, "step": 3535 }, { "epoch": 1.83991683991684, "grad_norm": 0.529729165924642, "learning_rate": 7.81018636479518e-06, "loss": 0.6012, "step": 3540 }, { "epoch": 1.8425155925155925, "grad_norm": 0.4916121082024032, "learning_rate": 7.780687921885324e-06, "loss": 0.5977, "step": 3545 }, { "epoch": 1.8451143451143452, "grad_norm": 0.5575500880550704, "learning_rate": 7.751209777006363e-06, "loss": 0.6003, "step": 3550 }, { "epoch": 1.8477130977130978, "grad_norm": 0.5326362594297853, "learning_rate": 7.7217521997681e-06, "loss": 0.6039, "step": 3555 }, { "epoch": 1.8503118503118503, "grad_norm": 0.6375012050960875, "learning_rate": 7.69231545959223e-06, "loss": 0.5903, "step": 3560 }, { "epoch": 1.8529106029106028, "grad_norm": 0.5127398317219315, "learning_rate": 7.66289982570986e-06, "loss": 0.6127, "step": 3565 }, { "epoch": 1.8555093555093554, "grad_norm": 0.5337342117702417, "learning_rate": 7.633505567159068e-06, "loss": 0.6106, "step": 3570 }, { "epoch": 1.8581081081081081, "grad_norm": 0.5127146513499672, "learning_rate": 7.604132952782421e-06, "loss": 0.593, "step": 3575 }, { "epoch": 1.8607068607068609, "grad_norm": 0.5475181259322507, "learning_rate": 7.574782251224541e-06, "loss": 0.6087, "step": 3580 }, { "epoch": 1.8633056133056134, "grad_norm": 0.5057492212324644, "learning_rate": 7.545453730929612e-06, "loss": 0.5961, "step": 3585 }, { "epoch": 1.865904365904366, "grad_norm": 0.5313703879609416, "learning_rate": 7.516147660138968e-06, "loss": 0.5826, "step": 3590 }, { "epoch": 1.8685031185031185, "grad_norm": 0.5198552151529012, "learning_rate": 7.486864306888608e-06, "loss": 0.6015, "step": 3595 }, { "epoch": 1.871101871101871, "grad_norm": 0.5410281063432927, "learning_rate": 7.457603939006745e-06, "loss": 0.6033, "step": 3600 }, { "epoch": 1.8737006237006237, "grad_norm": 0.5557013181252524, "learning_rate": 7.428366824111386e-06, "loss": 0.5902, "step": 3605 }, { "epoch": 1.8762993762993763, "grad_norm": 0.5633083487264265, "learning_rate": 7.399153229607849e-06, "loss": 0.6018, "step": 3610 }, { "epoch": 1.878898128898129, "grad_norm": 0.48595216285541615, "learning_rate": 7.369963422686335e-06, "loss": 0.594, "step": 3615 }, { "epoch": 1.8814968814968815, "grad_norm": 0.5169705729740565, "learning_rate": 7.340797670319488e-06, "loss": 0.5899, "step": 3620 }, { "epoch": 1.884095634095634, "grad_norm": 0.4970606877334214, "learning_rate": 7.311656239259934e-06, "loss": 0.6148, "step": 3625 }, { "epoch": 1.8866943866943866, "grad_norm": 0.5178433096469348, "learning_rate": 7.282539396037868e-06, "loss": 0.59, "step": 3630 }, { "epoch": 1.8892931392931391, "grad_norm": 0.5085741805913727, "learning_rate": 7.253447406958598e-06, "loss": 0.5969, "step": 3635 }, { "epoch": 1.8918918918918919, "grad_norm": 0.5339633821078309, "learning_rate": 7.2243805381001084e-06, "loss": 0.6013, "step": 3640 }, { "epoch": 1.8944906444906446, "grad_norm": 0.5142299591444427, "learning_rate": 7.195339055310635e-06, "loss": 0.605, "step": 3645 }, { "epoch": 1.8970893970893972, "grad_norm": 0.53012102257086, "learning_rate": 7.166323224206236e-06, "loss": 0.5934, "step": 3650 }, { "epoch": 1.8996881496881497, "grad_norm": 0.5180950494011575, "learning_rate": 7.13733331016835e-06, "loss": 0.5967, "step": 3655 }, { "epoch": 1.9022869022869022, "grad_norm": 0.5183177508817899, "learning_rate": 7.108369578341372e-06, "loss": 0.5823, "step": 3660 }, { "epoch": 1.9048856548856548, "grad_norm": 0.5070875844600755, "learning_rate": 7.079432293630244e-06, "loss": 0.5956, "step": 3665 }, { "epoch": 1.9074844074844075, "grad_norm": 0.6274267217296448, "learning_rate": 7.050521720698009e-06, "loss": 0.6114, "step": 3670 }, { "epoch": 1.91008316008316, "grad_norm": 0.5602574620780145, "learning_rate": 7.021638123963415e-06, "loss": 0.586, "step": 3675 }, { "epoch": 1.9126819126819128, "grad_norm": 0.517425035346988, "learning_rate": 6.992781767598467e-06, "loss": 0.5937, "step": 3680 }, { "epoch": 1.9152806652806653, "grad_norm": 0.49392256995221356, "learning_rate": 6.9639529155260355e-06, "loss": 0.5893, "step": 3685 }, { "epoch": 1.9178794178794178, "grad_norm": 0.5221453351667464, "learning_rate": 6.935151831417442e-06, "loss": 0.5921, "step": 3690 }, { "epoch": 1.9204781704781704, "grad_norm": 0.5437297366337159, "learning_rate": 6.906378778690023e-06, "loss": 0.5941, "step": 3695 }, { "epoch": 1.9230769230769231, "grad_norm": 0.5526585356985603, "learning_rate": 6.8776340205047446e-06, "loss": 0.5879, "step": 3700 }, { "epoch": 1.9256756756756757, "grad_norm": 0.47075169096755787, "learning_rate": 6.848917819763794e-06, "loss": 0.587, "step": 3705 }, { "epoch": 1.9282744282744284, "grad_norm": 0.5318393319444799, "learning_rate": 6.8202304391081665e-06, "loss": 0.5961, "step": 3710 }, { "epoch": 1.930873180873181, "grad_norm": 0.5429244483259561, "learning_rate": 6.791572140915258e-06, "loss": 0.5972, "step": 3715 }, { "epoch": 1.9334719334719335, "grad_norm": 0.7824999181116893, "learning_rate": 6.762943187296487e-06, "loss": 0.6025, "step": 3720 }, { "epoch": 1.936070686070686, "grad_norm": 0.5506609069987528, "learning_rate": 6.734343840094877e-06, "loss": 0.5935, "step": 3725 }, { "epoch": 1.9386694386694385, "grad_norm": 0.5393169028265578, "learning_rate": 6.705774360882662e-06, "loss": 0.5998, "step": 3730 }, { "epoch": 1.9412681912681913, "grad_norm": 0.5196746072745972, "learning_rate": 6.677235010958916e-06, "loss": 0.6024, "step": 3735 }, { "epoch": 1.943866943866944, "grad_norm": 0.5161195299496159, "learning_rate": 6.648726051347132e-06, "loss": 0.5923, "step": 3740 }, { "epoch": 1.9464656964656966, "grad_norm": 0.5385756600061604, "learning_rate": 6.6202477427928604e-06, "loss": 0.5936, "step": 3745 }, { "epoch": 1.949064449064449, "grad_norm": 0.5068176753311672, "learning_rate": 6.591800345761313e-06, "loss": 0.5857, "step": 3750 }, { "epoch": 1.9516632016632016, "grad_norm": 0.5156660031341467, "learning_rate": 6.563384120434978e-06, "loss": 0.5998, "step": 3755 }, { "epoch": 1.9542619542619541, "grad_norm": 0.5915967761576071, "learning_rate": 6.5349993267112455e-06, "loss": 0.5901, "step": 3760 }, { "epoch": 1.956860706860707, "grad_norm": 0.5286753068152813, "learning_rate": 6.506646224200036e-06, "loss": 0.606, "step": 3765 }, { "epoch": 1.9594594594594594, "grad_norm": 0.5510621595391834, "learning_rate": 6.4783250722214066e-06, "loss": 0.5996, "step": 3770 }, { "epoch": 1.9620582120582122, "grad_norm": 0.5050260932024032, "learning_rate": 6.450036129803205e-06, "loss": 0.5811, "step": 3775 }, { "epoch": 1.9646569646569647, "grad_norm": 0.5187426414977963, "learning_rate": 6.42177965567868e-06, "loss": 0.6012, "step": 3780 }, { "epoch": 1.9672557172557172, "grad_norm": 0.5226860354335852, "learning_rate": 6.393555908284119e-06, "loss": 0.6002, "step": 3785 }, { "epoch": 1.9698544698544698, "grad_norm": 0.5394751859696948, "learning_rate": 6.3653651457565005e-06, "loss": 0.6049, "step": 3790 }, { "epoch": 1.9724532224532223, "grad_norm": 0.5041393147335839, "learning_rate": 6.337207625931105e-06, "loss": 0.5995, "step": 3795 }, { "epoch": 1.975051975051975, "grad_norm": 0.533267009949286, "learning_rate": 6.309083606339184e-06, "loss": 0.5845, "step": 3800 }, { "epoch": 1.9776507276507278, "grad_norm": 0.5376844644903337, "learning_rate": 6.28099334420559e-06, "loss": 0.5889, "step": 3805 }, { "epoch": 1.9802494802494803, "grad_norm": 0.5424651587833641, "learning_rate": 6.252937096446422e-06, "loss": 0.5931, "step": 3810 }, { "epoch": 1.9828482328482329, "grad_norm": 0.5323019777176436, "learning_rate": 6.224915119666682e-06, "loss": 0.6001, "step": 3815 }, { "epoch": 1.9854469854469854, "grad_norm": 0.5719566751743559, "learning_rate": 6.196927670157931e-06, "loss": 0.5969, "step": 3820 }, { "epoch": 1.988045738045738, "grad_norm": 0.5144348819478973, "learning_rate": 6.168975003895939e-06, "loss": 0.6027, "step": 3825 }, { "epoch": 1.9906444906444907, "grad_norm": 0.5171213896946363, "learning_rate": 6.141057376538338e-06, "loss": 0.5986, "step": 3830 }, { "epoch": 1.9932432432432432, "grad_norm": 0.5178977499722083, "learning_rate": 6.113175043422301e-06, "loss": 0.6069, "step": 3835 }, { "epoch": 1.995841995841996, "grad_norm": 0.5264478858379251, "learning_rate": 6.085328259562195e-06, "loss": 0.5939, "step": 3840 }, { "epoch": 1.9984407484407485, "grad_norm": 0.49088468990078843, "learning_rate": 6.0575172796472405e-06, "loss": 0.5899, "step": 3845 }, { "epoch": 2.0, "eval_loss": 0.7568330764770508, "eval_runtime": 106.5795, "eval_samples_per_second": 77.032, "eval_steps_per_second": 1.21, "step": 3848 }, { "epoch": 2.001039501039501, "grad_norm": 0.6668925862814008, "learning_rate": 6.0297423580392055e-06, "loss": 0.5449, "step": 3850 }, { "epoch": 2.0036382536382535, "grad_norm": 0.6496810654128746, "learning_rate": 6.002003748770055e-06, "loss": 0.5054, "step": 3855 }, { "epoch": 2.006237006237006, "grad_norm": 0.6031011324917133, "learning_rate": 5.9743017055396424e-06, "loss": 0.508, "step": 3860 }, { "epoch": 2.008835758835759, "grad_norm": 0.6010814236947867, "learning_rate": 5.9466364817133886e-06, "loss": 0.5042, "step": 3865 }, { "epoch": 2.0114345114345116, "grad_norm": 0.5728886202402685, "learning_rate": 5.9190083303199505e-06, "loss": 0.5013, "step": 3870 }, { "epoch": 2.014033264033264, "grad_norm": 0.5424146827083851, "learning_rate": 5.891417504048926e-06, "loss": 0.5075, "step": 3875 }, { "epoch": 2.0166320166320166, "grad_norm": 0.5790413975893587, "learning_rate": 5.863864255248533e-06, "loss": 0.5179, "step": 3880 }, { "epoch": 2.019230769230769, "grad_norm": 0.5318824862807435, "learning_rate": 5.836348835923299e-06, "loss": 0.5068, "step": 3885 }, { "epoch": 2.0218295218295217, "grad_norm": 0.550883650215065, "learning_rate": 5.808871497731758e-06, "loss": 0.4974, "step": 3890 }, { "epoch": 2.024428274428274, "grad_norm": 0.5879882582188948, "learning_rate": 5.781432491984162e-06, "loss": 0.5113, "step": 3895 }, { "epoch": 2.027027027027027, "grad_norm": 0.5546832686816904, "learning_rate": 5.754032069640153e-06, "loss": 0.5063, "step": 3900 }, { "epoch": 2.0296257796257797, "grad_norm": 0.5211604329798696, "learning_rate": 5.726670481306505e-06, "loss": 0.5052, "step": 3905 }, { "epoch": 2.0322245322245323, "grad_norm": 0.5455878123275217, "learning_rate": 5.699347977234799e-06, "loss": 0.5053, "step": 3910 }, { "epoch": 2.034823284823285, "grad_norm": 0.5950657410818389, "learning_rate": 5.672064807319146e-06, "loss": 0.5152, "step": 3915 }, { "epoch": 2.0374220374220373, "grad_norm": 0.5858939065311778, "learning_rate": 5.644821221093916e-06, "loss": 0.5059, "step": 3920 }, { "epoch": 2.04002079002079, "grad_norm": 0.5347458170039379, "learning_rate": 5.617617467731438e-06, "loss": 0.5112, "step": 3925 }, { "epoch": 2.042619542619543, "grad_norm": 0.5384308493309783, "learning_rate": 5.5904537960397155e-06, "loss": 0.4975, "step": 3930 }, { "epoch": 2.0452182952182953, "grad_norm": 0.5730691087208541, "learning_rate": 5.563330454460179e-06, "loss": 0.4961, "step": 3935 }, { "epoch": 2.047817047817048, "grad_norm": 0.5477717438993087, "learning_rate": 5.536247691065384e-06, "loss": 0.5121, "step": 3940 }, { "epoch": 2.0504158004158004, "grad_norm": 0.5548351105040114, "learning_rate": 5.50920575355675e-06, "loss": 0.5079, "step": 3945 }, { "epoch": 2.053014553014553, "grad_norm": 0.5759237090673845, "learning_rate": 5.482204889262319e-06, "loss": 0.5093, "step": 3950 }, { "epoch": 2.0556133056133055, "grad_norm": 0.5547272581679922, "learning_rate": 5.455245345134449e-06, "loss": 0.4965, "step": 3955 }, { "epoch": 2.0582120582120584, "grad_norm": 0.5545227019423067, "learning_rate": 5.428327367747598e-06, "loss": 0.5056, "step": 3960 }, { "epoch": 2.060810810810811, "grad_norm": 0.5475321378756351, "learning_rate": 5.401451203296049e-06, "loss": 0.4992, "step": 3965 }, { "epoch": 2.0634095634095635, "grad_norm": 0.5571990780758471, "learning_rate": 5.37461709759165e-06, "loss": 0.5029, "step": 3970 }, { "epoch": 2.066008316008316, "grad_norm": 0.5646750653448925, "learning_rate": 5.3478252960615794e-06, "loss": 0.5045, "step": 3975 }, { "epoch": 2.0686070686070686, "grad_norm": 0.5748986438531573, "learning_rate": 5.321076043746108e-06, "loss": 0.4982, "step": 3980 }, { "epoch": 2.071205821205821, "grad_norm": 0.5627371455320099, "learning_rate": 5.2943695852963325e-06, "loss": 0.5096, "step": 3985 }, { "epoch": 2.0738045738045736, "grad_norm": 0.5535736181815755, "learning_rate": 5.267706164971966e-06, "loss": 0.502, "step": 3990 }, { "epoch": 2.0764033264033266, "grad_norm": 0.5361674621317485, "learning_rate": 5.241086026639079e-06, "loss": 0.5056, "step": 3995 }, { "epoch": 2.079002079002079, "grad_norm": 0.5644675385907009, "learning_rate": 5.214509413767892e-06, "loss": 0.5142, "step": 4000 }, { "epoch": 2.0816008316008316, "grad_norm": 0.5784423395730652, "learning_rate": 5.187976569430535e-06, "loss": 0.5087, "step": 4005 }, { "epoch": 2.084199584199584, "grad_norm": 0.592275009867849, "learning_rate": 5.1614877362988205e-06, "loss": 0.5027, "step": 4010 }, { "epoch": 2.0867983367983367, "grad_norm": 0.5950969421446421, "learning_rate": 5.1350431566420326e-06, "loss": 0.5046, "step": 4015 }, { "epoch": 2.0893970893970892, "grad_norm": 0.5446288597399254, "learning_rate": 5.108643072324717e-06, "loss": 0.5107, "step": 4020 }, { "epoch": 2.091995841995842, "grad_norm": 0.5839095060604741, "learning_rate": 5.082287724804453e-06, "loss": 0.507, "step": 4025 }, { "epoch": 2.0945945945945947, "grad_norm": 0.5801086689129009, "learning_rate": 5.055977355129653e-06, "loss": 0.5007, "step": 4030 }, { "epoch": 2.0971933471933473, "grad_norm": 0.5394364509572592, "learning_rate": 5.02971220393736e-06, "loss": 0.5079, "step": 4035 }, { "epoch": 2.0997920997921, "grad_norm": 0.5649634959442216, "learning_rate": 5.003492511451051e-06, "loss": 0.5042, "step": 4040 }, { "epoch": 2.1023908523908523, "grad_norm": 0.5697263888969452, "learning_rate": 4.977318517478421e-06, "loss": 0.5012, "step": 4045 }, { "epoch": 2.104989604989605, "grad_norm": 0.6166161420968725, "learning_rate": 4.951190461409214e-06, "loss": 0.511, "step": 4050 }, { "epoch": 2.1075883575883574, "grad_norm": 0.5624903877149114, "learning_rate": 4.925108582213013e-06, "loss": 0.5104, "step": 4055 }, { "epoch": 2.1101871101871104, "grad_norm": 0.5731533074752744, "learning_rate": 4.899073118437063e-06, "loss": 0.5109, "step": 4060 }, { "epoch": 2.112785862785863, "grad_norm": 0.5800809144559984, "learning_rate": 4.873084308204101e-06, "loss": 0.4999, "step": 4065 }, { "epoch": 2.1153846153846154, "grad_norm": 0.5639967552020521, "learning_rate": 4.84714238921015e-06, "loss": 0.4972, "step": 4070 }, { "epoch": 2.117983367983368, "grad_norm": 0.5540959676849216, "learning_rate": 4.821247598722373e-06, "loss": 0.4887, "step": 4075 }, { "epoch": 2.1205821205821205, "grad_norm": 0.5820289593717347, "learning_rate": 4.7954001735768925e-06, "loss": 0.4983, "step": 4080 }, { "epoch": 2.123180873180873, "grad_norm": 0.5665889825124238, "learning_rate": 4.7696003501766155e-06, "loss": 0.4928, "step": 4085 }, { "epoch": 2.125779625779626, "grad_norm": 0.5786397074647865, "learning_rate": 4.7438483644890776e-06, "loss": 0.509, "step": 4090 }, { "epoch": 2.1283783783783785, "grad_norm": 0.5544771675503383, "learning_rate": 4.718144452044299e-06, "loss": 0.5088, "step": 4095 }, { "epoch": 2.130977130977131, "grad_norm": 0.5842089375560309, "learning_rate": 4.692488847932601e-06, "loss": 0.5131, "step": 4100 }, { "epoch": 2.1335758835758836, "grad_norm": 0.5572667236950973, "learning_rate": 4.666881786802492e-06, "loss": 0.513, "step": 4105 }, { "epoch": 2.136174636174636, "grad_norm": 0.6261912246125306, "learning_rate": 4.6413235028584804e-06, "loss": 0.5053, "step": 4110 }, { "epoch": 2.1387733887733886, "grad_norm": 0.5702222004267216, "learning_rate": 4.615814229858969e-06, "loss": 0.495, "step": 4115 }, { "epoch": 2.141372141372141, "grad_norm": 0.5790166866248228, "learning_rate": 4.590354201114103e-06, "loss": 0.4973, "step": 4120 }, { "epoch": 2.143970893970894, "grad_norm": 0.5603345931162405, "learning_rate": 4.564943649483625e-06, "loss": 0.5063, "step": 4125 }, { "epoch": 2.1465696465696467, "grad_norm": 0.5569620723069888, "learning_rate": 4.539582807374756e-06, "loss": 0.4982, "step": 4130 }, { "epoch": 2.149168399168399, "grad_norm": 0.6019143400672264, "learning_rate": 4.514271906740082e-06, "loss": 0.5116, "step": 4135 }, { "epoch": 2.1517671517671517, "grad_norm": 0.5668221330685952, "learning_rate": 4.489011179075408e-06, "loss": 0.4989, "step": 4140 }, { "epoch": 2.1543659043659042, "grad_norm": 0.5777522818500115, "learning_rate": 4.46380085541765e-06, "loss": 0.4866, "step": 4145 }, { "epoch": 2.156964656964657, "grad_norm": 0.5577057043245417, "learning_rate": 4.438641166342733e-06, "loss": 0.5048, "step": 4150 }, { "epoch": 2.1595634095634098, "grad_norm": 0.5811543313527234, "learning_rate": 4.413532341963477e-06, "loss": 0.5024, "step": 4155 }, { "epoch": 2.1621621621621623, "grad_norm": 0.5901491166344425, "learning_rate": 4.388474611927472e-06, "loss": 0.4985, "step": 4160 }, { "epoch": 2.164760914760915, "grad_norm": 0.5537573132486768, "learning_rate": 4.363468205415014e-06, "loss": 0.4956, "step": 4165 }, { "epoch": 2.1673596673596673, "grad_norm": 0.5660909275231115, "learning_rate": 4.338513351136977e-06, "loss": 0.4928, "step": 4170 }, { "epoch": 2.16995841995842, "grad_norm": 0.5991376447658537, "learning_rate": 4.313610277332732e-06, "loss": 0.499, "step": 4175 }, { "epoch": 2.1725571725571724, "grad_norm": 0.6075244421550833, "learning_rate": 4.288759211768072e-06, "loss": 0.5033, "step": 4180 }, { "epoch": 2.1751559251559254, "grad_norm": 0.5517113456938116, "learning_rate": 4.263960381733106e-06, "loss": 0.4951, "step": 4185 }, { "epoch": 2.177754677754678, "grad_norm": 0.5677317519142169, "learning_rate": 4.2392140140401996e-06, "loss": 0.4978, "step": 4190 }, { "epoch": 2.1803534303534304, "grad_norm": 0.569770318980704, "learning_rate": 4.214520335021896e-06, "loss": 0.4939, "step": 4195 }, { "epoch": 2.182952182952183, "grad_norm": 0.6118856781558967, "learning_rate": 4.189879570528831e-06, "loss": 0.5069, "step": 4200 }, { "epoch": 2.1855509355509355, "grad_norm": 0.606019484002795, "learning_rate": 4.165291945927693e-06, "loss": 0.5043, "step": 4205 }, { "epoch": 2.188149688149688, "grad_norm": 0.5556726937928989, "learning_rate": 4.140757686099137e-06, "loss": 0.4868, "step": 4210 }, { "epoch": 2.1907484407484406, "grad_norm": 0.5837653560310493, "learning_rate": 4.116277015435743e-06, "loss": 0.5015, "step": 4215 }, { "epoch": 2.1933471933471935, "grad_norm": 0.588179392816627, "learning_rate": 4.091850157839963e-06, "loss": 0.503, "step": 4220 }, { "epoch": 2.195945945945946, "grad_norm": 0.5994112141091228, "learning_rate": 4.067477336722063e-06, "loss": 0.5124, "step": 4225 }, { "epoch": 2.1985446985446986, "grad_norm": 0.6035705377584152, "learning_rate": 4.043158774998093e-06, "loss": 0.5089, "step": 4230 }, { "epoch": 2.201143451143451, "grad_norm": 0.5688655453887249, "learning_rate": 4.01889469508784e-06, "loss": 0.5043, "step": 4235 }, { "epoch": 2.2037422037422036, "grad_norm": 0.5917248307889117, "learning_rate": 3.994685318912794e-06, "loss": 0.5163, "step": 4240 }, { "epoch": 2.206340956340956, "grad_norm": 0.5394197139265716, "learning_rate": 3.970530867894114e-06, "loss": 0.5069, "step": 4245 }, { "epoch": 2.208939708939709, "grad_norm": 0.576389590721846, "learning_rate": 3.946431562950624e-06, "loss": 0.5005, "step": 4250 }, { "epoch": 2.2115384615384617, "grad_norm": 0.595056352282733, "learning_rate": 3.922387624496762e-06, "loss": 0.5043, "step": 4255 }, { "epoch": 2.214137214137214, "grad_norm": 0.5572014241693316, "learning_rate": 3.89839927244058e-06, "loss": 0.5074, "step": 4260 }, { "epoch": 2.2167359667359667, "grad_norm": 0.5659607082452609, "learning_rate": 3.87446672618174e-06, "loss": 0.5078, "step": 4265 }, { "epoch": 2.2193347193347193, "grad_norm": 0.5659519225313114, "learning_rate": 3.850590204609501e-06, "loss": 0.5042, "step": 4270 }, { "epoch": 2.221933471933472, "grad_norm": 0.5520449827126801, "learning_rate": 3.826769926100699e-06, "loss": 0.5049, "step": 4275 }, { "epoch": 2.2245322245322248, "grad_norm": 0.9464101657600216, "learning_rate": 3.803006108517786e-06, "loss": 0.5049, "step": 4280 }, { "epoch": 2.2271309771309773, "grad_norm": 0.5678141927512144, "learning_rate": 3.7792989692068018e-06, "loss": 0.5035, "step": 4285 }, { "epoch": 2.22972972972973, "grad_norm": 0.5706457165389711, "learning_rate": 3.755648724995404e-06, "loss": 0.4968, "step": 4290 }, { "epoch": 2.2323284823284824, "grad_norm": 0.6171952232229321, "learning_rate": 3.732055592190893e-06, "loss": 0.5082, "step": 4295 }, { "epoch": 2.234927234927235, "grad_norm": 0.600087841592369, "learning_rate": 3.7085197865782085e-06, "loss": 0.5039, "step": 4300 }, { "epoch": 2.2375259875259874, "grad_norm": 0.5625791681960587, "learning_rate": 3.6850415234179805e-06, "loss": 0.5041, "step": 4305 }, { "epoch": 2.24012474012474, "grad_norm": 0.596631151098558, "learning_rate": 3.661621017444551e-06, "loss": 0.5013, "step": 4310 }, { "epoch": 2.242723492723493, "grad_norm": 0.5324892002802353, "learning_rate": 3.638258482863999e-06, "loss": 0.4958, "step": 4315 }, { "epoch": 2.2453222453222454, "grad_norm": 0.5720843807653389, "learning_rate": 3.6149541333522053e-06, "loss": 0.4994, "step": 4320 }, { "epoch": 2.247920997920998, "grad_norm": 0.5550466004827054, "learning_rate": 3.5917081820528765e-06, "loss": 0.5066, "step": 4325 }, { "epoch": 2.2505197505197505, "grad_norm": 0.5853024960860177, "learning_rate": 3.568520841575601e-06, "loss": 0.4984, "step": 4330 }, { "epoch": 2.253118503118503, "grad_norm": 0.5591994418510899, "learning_rate": 3.5453923239939192e-06, "loss": 0.5057, "step": 4335 }, { "epoch": 2.2557172557172556, "grad_norm": 0.595876343661196, "learning_rate": 3.5223228408433564e-06, "loss": 0.4978, "step": 4340 }, { "epoch": 2.258316008316008, "grad_norm": 0.5689153116175016, "learning_rate": 3.499312603119517e-06, "loss": 0.5045, "step": 4345 }, { "epoch": 2.260914760914761, "grad_norm": 0.617131890152672, "learning_rate": 3.4763618212761376e-06, "loss": 0.5068, "step": 4350 }, { "epoch": 2.2635135135135136, "grad_norm": 0.5756546456394432, "learning_rate": 3.453470705223162e-06, "loss": 0.5006, "step": 4355 }, { "epoch": 2.266112266112266, "grad_norm": 0.5904214264537652, "learning_rate": 3.430639464324825e-06, "loss": 0.509, "step": 4360 }, { "epoch": 2.2687110187110187, "grad_norm": 0.5486092941094705, "learning_rate": 3.407868307397747e-06, "loss": 0.4956, "step": 4365 }, { "epoch": 2.271309771309771, "grad_norm": 0.6341681884960043, "learning_rate": 3.3851574427090028e-06, "loss": 0.502, "step": 4370 }, { "epoch": 2.2739085239085237, "grad_norm": 0.5816609147620979, "learning_rate": 3.362507077974234e-06, "loss": 0.5053, "step": 4375 }, { "epoch": 2.2765072765072767, "grad_norm": 0.5992096025731823, "learning_rate": 3.339917420355746e-06, "loss": 0.4915, "step": 4380 }, { "epoch": 2.279106029106029, "grad_norm": 0.5634477819700985, "learning_rate": 3.3173886764606133e-06, "loss": 0.5034, "step": 4385 }, { "epoch": 2.2817047817047817, "grad_norm": 0.5407976728647481, "learning_rate": 3.2949210523387786e-06, "loss": 0.4999, "step": 4390 }, { "epoch": 2.2843035343035343, "grad_norm": 0.562584471586657, "learning_rate": 3.2725147534811885e-06, "loss": 0.502, "step": 4395 }, { "epoch": 2.286902286902287, "grad_norm": 0.5561376196303791, "learning_rate": 3.250169984817897e-06, "loss": 0.4996, "step": 4400 }, { "epoch": 2.2895010395010393, "grad_norm": 0.5288577740017452, "learning_rate": 3.2278869507161947e-06, "loss": 0.4923, "step": 4405 }, { "epoch": 2.2920997920997923, "grad_norm": 0.5564297800059832, "learning_rate": 3.2056658549787513e-06, "loss": 0.5004, "step": 4410 }, { "epoch": 2.294698544698545, "grad_norm": 0.5807459489768877, "learning_rate": 3.1835069008417307e-06, "loss": 0.513, "step": 4415 }, { "epoch": 2.2972972972972974, "grad_norm": 0.5698550653282723, "learning_rate": 3.1614102909729547e-06, "loss": 0.5017, "step": 4420 }, { "epoch": 2.29989604989605, "grad_norm": 0.5484677379859523, "learning_rate": 3.139376227470038e-06, "loss": 0.4948, "step": 4425 }, { "epoch": 2.3024948024948024, "grad_norm": 0.6034586450111454, "learning_rate": 3.1174049118585303e-06, "loss": 0.5057, "step": 4430 }, { "epoch": 2.305093555093555, "grad_norm": 0.6209164341363942, "learning_rate": 3.0954965450900963e-06, "loss": 0.5013, "step": 4435 }, { "epoch": 2.3076923076923075, "grad_norm": 0.608022280956626, "learning_rate": 3.0736513275406565e-06, "loss": 0.5007, "step": 4440 }, { "epoch": 2.3102910602910605, "grad_norm": 0.5822912947800326, "learning_rate": 3.0518694590085608e-06, "loss": 0.4878, "step": 4445 }, { "epoch": 2.312889812889813, "grad_norm": 0.6052596335796735, "learning_rate": 3.0301511387127746e-06, "loss": 0.5048, "step": 4450 }, { "epoch": 2.3154885654885655, "grad_norm": 0.6109257960539891, "learning_rate": 3.0084965652910314e-06, "loss": 0.4979, "step": 4455 }, { "epoch": 2.318087318087318, "grad_norm": 0.6001612610617809, "learning_rate": 2.9869059367980402e-06, "loss": 0.502, "step": 4460 }, { "epoch": 2.3206860706860706, "grad_norm": 0.5748427683895482, "learning_rate": 2.965379450703665e-06, "loss": 0.4976, "step": 4465 }, { "epoch": 2.323284823284823, "grad_norm": 0.6062682998551074, "learning_rate": 2.943917303891107e-06, "loss": 0.51, "step": 4470 }, { "epoch": 2.3258835758835756, "grad_norm": 0.5866563557363672, "learning_rate": 2.92251969265512e-06, "loss": 0.5063, "step": 4475 }, { "epoch": 2.3284823284823286, "grad_norm": 0.5512520483966091, "learning_rate": 2.9011868127002153e-06, "loss": 0.4934, "step": 4480 }, { "epoch": 2.331081081081081, "grad_norm": 0.5427787073773119, "learning_rate": 2.879918859138857e-06, "loss": 0.4909, "step": 4485 }, { "epoch": 2.3336798336798337, "grad_norm": 0.5807057669777462, "learning_rate": 2.8587160264896873e-06, "loss": 0.4955, "step": 4490 }, { "epoch": 2.336278586278586, "grad_norm": 0.6094348111906394, "learning_rate": 2.8375785086757533e-06, "loss": 0.5028, "step": 4495 }, { "epoch": 2.3388773388773387, "grad_norm": 0.569446343993791, "learning_rate": 2.8165064990227255e-06, "loss": 0.4966, "step": 4500 }, { "epoch": 2.3414760914760917, "grad_norm": 0.546949602624272, "learning_rate": 2.795500190257122e-06, "loss": 0.5041, "step": 4505 }, { "epoch": 2.3440748440748442, "grad_norm": 0.5841136870299933, "learning_rate": 2.774559774504566e-06, "loss": 0.5093, "step": 4510 }, { "epoch": 2.3466735966735968, "grad_norm": 0.5894084600218413, "learning_rate": 2.75368544328801e-06, "loss": 0.5018, "step": 4515 }, { "epoch": 2.3492723492723493, "grad_norm": 0.5849212705691518, "learning_rate": 2.7328773875259905e-06, "loss": 0.4983, "step": 4520 }, { "epoch": 2.351871101871102, "grad_norm": 0.5594245456576148, "learning_rate": 2.7121357975308893e-06, "loss": 0.5116, "step": 4525 }, { "epoch": 2.3544698544698544, "grad_norm": 0.5904437980074254, "learning_rate": 2.691460863007178e-06, "loss": 0.5046, "step": 4530 }, { "epoch": 2.357068607068607, "grad_norm": 0.6092333364117684, "learning_rate": 2.670852773049698e-06, "loss": 0.492, "step": 4535 }, { "epoch": 2.35966735966736, "grad_norm": 0.5406949036065258, "learning_rate": 2.6503117161419246e-06, "loss": 0.4966, "step": 4540 }, { "epoch": 2.3622661122661124, "grad_norm": 0.6499059905714683, "learning_rate": 2.6298378801542337e-06, "loss": 0.4995, "step": 4545 }, { "epoch": 2.364864864864865, "grad_norm": 0.5417621572559367, "learning_rate": 2.6094314523422035e-06, "loss": 0.4903, "step": 4550 }, { "epoch": 2.3674636174636174, "grad_norm": 0.5832045594170597, "learning_rate": 2.589092619344885e-06, "loss": 0.4937, "step": 4555 }, { "epoch": 2.37006237006237, "grad_norm": 0.570494106023411, "learning_rate": 2.5688215671830975e-06, "loss": 0.4967, "step": 4560 }, { "epoch": 2.3726611226611225, "grad_norm": 0.5563324888807575, "learning_rate": 2.54861848125774e-06, "loss": 0.5039, "step": 4565 }, { "epoch": 2.375259875259875, "grad_norm": 0.5891719757564269, "learning_rate": 2.5284835463480774e-06, "loss": 0.5009, "step": 4570 }, { "epoch": 2.377858627858628, "grad_norm": 0.6000418457824788, "learning_rate": 2.5084169466100626e-06, "loss": 0.494, "step": 4575 }, { "epoch": 2.3804573804573805, "grad_norm": 0.5612803989317922, "learning_rate": 2.4884188655746554e-06, "loss": 0.4974, "step": 4580 }, { "epoch": 2.383056133056133, "grad_norm": 0.5574484874125388, "learning_rate": 2.468489486146125e-06, "loss": 0.4953, "step": 4585 }, { "epoch": 2.3856548856548856, "grad_norm": 0.550628523258081, "learning_rate": 2.4486289906003935e-06, "loss": 0.5182, "step": 4590 }, { "epoch": 2.388253638253638, "grad_norm": 0.567017209479145, "learning_rate": 2.4288375605833726e-06, "loss": 0.4907, "step": 4595 }, { "epoch": 2.390852390852391, "grad_norm": 0.5474114054711359, "learning_rate": 2.4091153771092847e-06, "loss": 0.4976, "step": 4600 }, { "epoch": 2.3934511434511436, "grad_norm": 0.5567614559206484, "learning_rate": 2.3894626205590177e-06, "loss": 0.4925, "step": 4605 }, { "epoch": 2.396049896049896, "grad_norm": 0.5620691248378288, "learning_rate": 2.36987947067848e-06, "loss": 0.4892, "step": 4610 }, { "epoch": 2.3986486486486487, "grad_norm": 0.5471599595016963, "learning_rate": 2.3503661065769523e-06, "loss": 0.5006, "step": 4615 }, { "epoch": 2.401247401247401, "grad_norm": 0.5643679588409989, "learning_rate": 2.330922706725437e-06, "loss": 0.5052, "step": 4620 }, { "epoch": 2.4038461538461537, "grad_norm": 0.5992107723526578, "learning_rate": 2.3115494489550517e-06, "loss": 0.4944, "step": 4625 }, { "epoch": 2.4064449064449063, "grad_norm": 0.5735681525239322, "learning_rate": 2.292246510455375e-06, "loss": 0.5023, "step": 4630 }, { "epoch": 2.4090436590436592, "grad_norm": 0.5569413415577497, "learning_rate": 2.2730140677728485e-06, "loss": 0.5017, "step": 4635 }, { "epoch": 2.4116424116424118, "grad_norm": 0.5657509769713301, "learning_rate": 2.253852296809148e-06, "loss": 0.5018, "step": 4640 }, { "epoch": 2.4142411642411643, "grad_norm": 0.561092028484337, "learning_rate": 2.234761372819577e-06, "loss": 0.5005, "step": 4645 }, { "epoch": 2.416839916839917, "grad_norm": 0.584135442702734, "learning_rate": 2.215741470411472e-06, "loss": 0.495, "step": 4650 }, { "epoch": 2.4194386694386694, "grad_norm": 0.5585660724073979, "learning_rate": 2.196792763542599e-06, "loss": 0.5045, "step": 4655 }, { "epoch": 2.422037422037422, "grad_norm": 0.5584867361238677, "learning_rate": 2.1779154255195576e-06, "loss": 0.5018, "step": 4660 }, { "epoch": 2.4246361746361744, "grad_norm": 0.566982522139209, "learning_rate": 2.1591096289962077e-06, "loss": 0.4911, "step": 4665 }, { "epoch": 2.4272349272349274, "grad_norm": 0.560220035509712, "learning_rate": 2.140375545972081e-06, "loss": 0.5021, "step": 4670 }, { "epoch": 2.42983367983368, "grad_norm": 0.5507532159687185, "learning_rate": 2.121713347790808e-06, "loss": 0.5036, "step": 4675 }, { "epoch": 2.4324324324324325, "grad_norm": 0.5966472596819247, "learning_rate": 2.1031232051385606e-06, "loss": 0.4966, "step": 4680 }, { "epoch": 2.435031185031185, "grad_norm": 0.5544285219883713, "learning_rate": 2.0846052880424783e-06, "loss": 0.501, "step": 4685 }, { "epoch": 2.4376299376299375, "grad_norm": 0.5182057167941686, "learning_rate": 2.0661597658691226e-06, "loss": 0.4904, "step": 4690 }, { "epoch": 2.44022869022869, "grad_norm": 0.5694712994846337, "learning_rate": 2.047786807322927e-06, "loss": 0.4875, "step": 4695 }, { "epoch": 2.442827442827443, "grad_norm": 0.5644710268706207, "learning_rate": 2.029486580444644e-06, "loss": 0.4919, "step": 4700 }, { "epoch": 2.4454261954261955, "grad_norm": 0.5709123415197537, "learning_rate": 2.0112592526098173e-06, "loss": 0.5087, "step": 4705 }, { "epoch": 2.448024948024948, "grad_norm": 0.5535461175978135, "learning_rate": 1.993104990527257e-06, "loss": 0.4921, "step": 4710 }, { "epoch": 2.4506237006237006, "grad_norm": 0.6301006821974645, "learning_rate": 1.975023960237499e-06, "loss": 0.4885, "step": 4715 }, { "epoch": 2.453222453222453, "grad_norm": 0.5494288053608467, "learning_rate": 1.957016327111294e-06, "loss": 0.4906, "step": 4720 }, { "epoch": 2.4558212058212057, "grad_norm": 0.5493564158683376, "learning_rate": 1.9390822558481014e-06, "loss": 0.4955, "step": 4725 }, { "epoch": 2.4584199584199586, "grad_norm": 0.6043307426388902, "learning_rate": 1.921221910474579e-06, "loss": 0.5007, "step": 4730 }, { "epoch": 2.461018711018711, "grad_norm": 0.5909390472872661, "learning_rate": 1.9034354543430677e-06, "loss": 0.5009, "step": 4735 }, { "epoch": 2.4636174636174637, "grad_norm": 0.5601290888435961, "learning_rate": 1.885723050130127e-06, "loss": 0.4869, "step": 4740 }, { "epoch": 2.4662162162162162, "grad_norm": 0.5671638427007798, "learning_rate": 1.8680848598350165e-06, "loss": 0.5002, "step": 4745 }, { "epoch": 2.4688149688149688, "grad_norm": 0.5597631082866084, "learning_rate": 1.8505210447782418e-06, "loss": 0.5092, "step": 4750 }, { "epoch": 2.4714137214137213, "grad_norm": 0.5611497450799863, "learning_rate": 1.833031765600054e-06, "loss": 0.5008, "step": 4755 }, { "epoch": 2.474012474012474, "grad_norm": 0.5601559085266762, "learning_rate": 1.8156171822589963e-06, "loss": 0.4887, "step": 4760 }, { "epoch": 2.476611226611227, "grad_norm": 0.555263493680061, "learning_rate": 1.7982774540304404e-06, "loss": 0.5112, "step": 4765 }, { "epoch": 2.4792099792099793, "grad_norm": 0.5663743347641695, "learning_rate": 1.781012739505127e-06, "loss": 0.4907, "step": 4770 }, { "epoch": 2.481808731808732, "grad_norm": 0.6155955922535356, "learning_rate": 1.7638231965877039e-06, "loss": 0.4836, "step": 4775 }, { "epoch": 2.4844074844074844, "grad_norm": 0.5902555495646782, "learning_rate": 1.7467089824953077e-06, "loss": 0.5047, "step": 4780 }, { "epoch": 2.487006237006237, "grad_norm": 0.5720398120641105, "learning_rate": 1.7296702537560994e-06, "loss": 0.5094, "step": 4785 }, { "epoch": 2.4896049896049894, "grad_norm": 0.5593330846808308, "learning_rate": 1.7127071662078455e-06, "loss": 0.5121, "step": 4790 }, { "epoch": 2.492203742203742, "grad_norm": 0.5807674813382018, "learning_rate": 1.6958198749964983e-06, "loss": 0.4888, "step": 4795 }, { "epoch": 2.494802494802495, "grad_norm": 0.5712031491060828, "learning_rate": 1.679008534574761e-06, "loss": 0.485, "step": 4800 }, { "epoch": 2.4974012474012475, "grad_norm": 0.5616832705475885, "learning_rate": 1.6622732987006884e-06, "loss": 0.5019, "step": 4805 }, { "epoch": 2.5, "grad_norm": 0.5536996546706574, "learning_rate": 1.6456143204362807e-06, "loss": 0.4933, "step": 4810 }, { "epoch": 2.5025987525987525, "grad_norm": 0.5707614937226522, "learning_rate": 1.6290317521460697e-06, "loss": 0.4828, "step": 4815 }, { "epoch": 2.505197505197505, "grad_norm": 0.5927994991308208, "learning_rate": 1.6125257454957365e-06, "loss": 0.4861, "step": 4820 }, { "epoch": 2.507796257796258, "grad_norm": 0.5852679815841081, "learning_rate": 1.5960964514507316e-06, "loss": 0.4944, "step": 4825 }, { "epoch": 2.51039501039501, "grad_norm": 0.5782206531686512, "learning_rate": 1.5797440202748748e-06, "loss": 0.4897, "step": 4830 }, { "epoch": 2.512993762993763, "grad_norm": 0.5749262146519877, "learning_rate": 1.5634686015289925e-06, "loss": 0.5008, "step": 4835 }, { "epoch": 2.5155925155925156, "grad_norm": 0.5902984761192304, "learning_rate": 1.5472703440695524e-06, "loss": 0.4997, "step": 4840 }, { "epoch": 2.518191268191268, "grad_norm": 0.5725171354203544, "learning_rate": 1.5311493960472978e-06, "loss": 0.4913, "step": 4845 }, { "epoch": 2.5207900207900207, "grad_norm": 0.5489936561056176, "learning_rate": 1.5151059049058913e-06, "loss": 0.4965, "step": 4850 }, { "epoch": 2.523388773388773, "grad_norm": 0.571188356733997, "learning_rate": 1.499140017380566e-06, "loss": 0.4955, "step": 4855 }, { "epoch": 2.525987525987526, "grad_norm": 0.543934249979962, "learning_rate": 1.4832518794967853e-06, "loss": 0.498, "step": 4860 }, { "epoch": 2.5285862785862787, "grad_norm": 0.5779586017866482, "learning_rate": 1.4674416365689137e-06, "loss": 0.5079, "step": 4865 }, { "epoch": 2.5311850311850312, "grad_norm": 0.573790412674796, "learning_rate": 1.4517094331988734e-06, "loss": 0.5071, "step": 4870 }, { "epoch": 2.5337837837837838, "grad_norm": 0.5834488347165243, "learning_rate": 1.4360554132748305e-06, "loss": 0.493, "step": 4875 }, { "epoch": 2.5363825363825363, "grad_norm": 0.5521193528499587, "learning_rate": 1.4204797199698839e-06, "loss": 0.4893, "step": 4880 }, { "epoch": 2.538981288981289, "grad_norm": 0.5837955107251298, "learning_rate": 1.4049824957407464e-06, "loss": 0.4998, "step": 4885 }, { "epoch": 2.5415800415800414, "grad_norm": 0.5654955515661542, "learning_rate": 1.3895638823264447e-06, "loss": 0.4913, "step": 4890 }, { "epoch": 2.5441787941787943, "grad_norm": 0.5743033149419415, "learning_rate": 1.374224020747027e-06, "loss": 0.5056, "step": 4895 }, { "epoch": 2.546777546777547, "grad_norm": 0.5855881014618302, "learning_rate": 1.3589630513022656e-06, "loss": 0.5028, "step": 4900 }, { "epoch": 2.5493762993762994, "grad_norm": 0.6031010192364838, "learning_rate": 1.3437811135703792e-06, "loss": 0.4964, "step": 4905 }, { "epoch": 2.551975051975052, "grad_norm": 0.5409820031001269, "learning_rate": 1.328678346406761e-06, "loss": 0.4946, "step": 4910 }, { "epoch": 2.5545738045738045, "grad_norm": 0.5667214248558752, "learning_rate": 1.3136548879426926e-06, "loss": 0.492, "step": 4915 }, { "epoch": 2.5571725571725574, "grad_norm": 0.5478082998559753, "learning_rate": 1.2987108755840994e-06, "loss": 0.4949, "step": 4920 }, { "epoch": 2.5597713097713095, "grad_norm": 0.5748275704846928, "learning_rate": 1.2838464460102862e-06, "loss": 0.4969, "step": 4925 }, { "epoch": 2.5623700623700625, "grad_norm": 0.5561105303734099, "learning_rate": 1.2690617351726798e-06, "loss": 0.4967, "step": 4930 }, { "epoch": 2.564968814968815, "grad_norm": 0.5847660828596739, "learning_rate": 1.2543568782935933e-06, "loss": 0.4893, "step": 4935 }, { "epoch": 2.5675675675675675, "grad_norm": 0.5797822737989639, "learning_rate": 1.2397320098649957e-06, "loss": 0.5002, "step": 4940 }, { "epoch": 2.57016632016632, "grad_norm": 0.5696211912101424, "learning_rate": 1.225187263647265e-06, "loss": 0.5056, "step": 4945 }, { "epoch": 2.5727650727650726, "grad_norm": 0.6105509252737591, "learning_rate": 1.210722772667977e-06, "loss": 0.4786, "step": 4950 }, { "epoch": 2.5753638253638256, "grad_norm": 0.5710521831184937, "learning_rate": 1.196338669220689e-06, "loss": 0.4895, "step": 4955 }, { "epoch": 2.577962577962578, "grad_norm": 0.5601653499624455, "learning_rate": 1.182035084863724e-06, "loss": 0.5016, "step": 4960 }, { "epoch": 2.5805613305613306, "grad_norm": 0.584343919584128, "learning_rate": 1.167812150418972e-06, "loss": 0.5159, "step": 4965 }, { "epoch": 2.583160083160083, "grad_norm": 0.6129296779221889, "learning_rate": 1.1536699959706898e-06, "loss": 0.5055, "step": 4970 }, { "epoch": 2.5857588357588357, "grad_norm": 0.5844416957330778, "learning_rate": 1.1396087508643106e-06, "loss": 0.504, "step": 4975 }, { "epoch": 2.5883575883575882, "grad_norm": 0.5750509882184978, "learning_rate": 1.1256285437052684e-06, "loss": 0.4925, "step": 4980 }, { "epoch": 2.5909563409563408, "grad_norm": 0.5486207493135079, "learning_rate": 1.1117295023578134e-06, "loss": 0.5079, "step": 4985 }, { "epoch": 2.5935550935550937, "grad_norm": 0.5751013655385914, "learning_rate": 1.0979117539438444e-06, "loss": 0.4925, "step": 4990 }, { "epoch": 2.5961538461538463, "grad_norm": 0.5940615821819871, "learning_rate": 1.0841754248417535e-06, "loss": 0.5001, "step": 4995 }, { "epoch": 2.598752598752599, "grad_norm": 0.544341637873671, "learning_rate": 1.0705206406852607e-06, "loss": 0.5003, "step": 5000 }, { "epoch": 2.6013513513513513, "grad_norm": 0.5688429085624325, "learning_rate": 1.0569475263622652e-06, "loss": 0.492, "step": 5005 }, { "epoch": 2.603950103950104, "grad_norm": 0.5898916948570275, "learning_rate": 1.0434562060137154e-06, "loss": 0.494, "step": 5010 }, { "epoch": 2.606548856548857, "grad_norm": 0.5415866533855809, "learning_rate": 1.030046803032455e-06, "loss": 0.4904, "step": 5015 }, { "epoch": 2.609147609147609, "grad_norm": 0.5719708739269925, "learning_rate": 1.0167194400621072e-06, "loss": 0.489, "step": 5020 }, { "epoch": 2.611746361746362, "grad_norm": 0.5958211082907041, "learning_rate": 1.003474238995954e-06, "loss": 0.4957, "step": 5025 }, { "epoch": 2.6143451143451144, "grad_norm": 0.5541558159414748, "learning_rate": 9.903113209758098e-07, "loss": 0.4993, "step": 5030 }, { "epoch": 2.616943866943867, "grad_norm": 0.5872960411024288, "learning_rate": 9.772308063909263e-07, "loss": 0.5105, "step": 5035 }, { "epoch": 2.6195426195426195, "grad_norm": 0.5605189291423912, "learning_rate": 9.642328148768865e-07, "loss": 0.4963, "step": 5040 }, { "epoch": 2.622141372141372, "grad_norm": 0.5607585059207263, "learning_rate": 9.513174653145052e-07, "loss": 0.5028, "step": 5045 }, { "epoch": 2.624740124740125, "grad_norm": 0.5534539551699679, "learning_rate": 9.384848758287469e-07, "loss": 0.4894, "step": 5050 }, { "epoch": 2.6273388773388775, "grad_norm": 0.5668591204471706, "learning_rate": 9.25735163787651e-07, "loss": 0.5004, "step": 5055 }, { "epoch": 2.62993762993763, "grad_norm": 0.5428488027300683, "learning_rate": 9.13068445801244e-07, "loss": 0.5028, "step": 5060 }, { "epoch": 2.6325363825363826, "grad_norm": 0.5730809692935364, "learning_rate": 9.004848377204878e-07, "loss": 0.4961, "step": 5065 }, { "epoch": 2.635135135135135, "grad_norm": 0.5287247837670602, "learning_rate": 8.879844546362093e-07, "loss": 0.499, "step": 5070 }, { "epoch": 2.6377338877338876, "grad_norm": 0.5730710423541429, "learning_rate": 8.755674108780532e-07, "loss": 0.4964, "step": 5075 }, { "epoch": 2.64033264033264, "grad_norm": 0.5506264010952827, "learning_rate": 8.632338200134382e-07, "loss": 0.4936, "step": 5080 }, { "epoch": 2.642931392931393, "grad_norm": 0.519210055930055, "learning_rate": 8.509837948465094e-07, "loss": 0.49, "step": 5085 }, { "epoch": 2.6455301455301456, "grad_norm": 0.5809039207230658, "learning_rate": 8.388174474171163e-07, "loss": 0.5033, "step": 5090 }, { "epoch": 2.648128898128898, "grad_norm": 0.561018096449011, "learning_rate": 8.267348889997839e-07, "loss": 0.5051, "step": 5095 }, { "epoch": 2.6507276507276507, "grad_norm": 0.5422624927838648, "learning_rate": 8.14736230102694e-07, "loss": 0.4864, "step": 5100 }, { "epoch": 2.6533264033264032, "grad_norm": 0.549824075528394, "learning_rate": 8.028215804666761e-07, "loss": 0.5027, "step": 5105 }, { "epoch": 2.6559251559251558, "grad_norm": 0.6073556177013598, "learning_rate": 7.909910490642025e-07, "loss": 0.4981, "step": 5110 }, { "epoch": 2.6585239085239083, "grad_norm": 0.5812550130344551, "learning_rate": 7.792447440983985e-07, "loss": 0.504, "step": 5115 }, { "epoch": 2.6611226611226613, "grad_norm": 0.5688133990130678, "learning_rate": 7.675827730020358e-07, "loss": 0.5004, "step": 5120 }, { "epoch": 2.663721413721414, "grad_norm": 0.5617035595950866, "learning_rate": 7.560052424365716e-07, "loss": 0.4923, "step": 5125 }, { "epoch": 2.6663201663201663, "grad_norm": 0.5835381005107588, "learning_rate": 7.445122582911546e-07, "loss": 0.4989, "step": 5130 }, { "epoch": 2.668918918918919, "grad_norm": 0.5681826093882452, "learning_rate": 7.331039256816664e-07, "loss": 0.5001, "step": 5135 }, { "epoch": 2.6715176715176714, "grad_norm": 0.5416547579730493, "learning_rate": 7.217803489497621e-07, "loss": 0.4915, "step": 5140 }, { "epoch": 2.6741164241164244, "grad_norm": 0.5708948503374369, "learning_rate": 7.10541631661904e-07, "loss": 0.506, "step": 5145 }, { "epoch": 2.6767151767151764, "grad_norm": 0.5825889025850369, "learning_rate": 6.993878766084295e-07, "loss": 0.4978, "step": 5150 }, { "epoch": 2.6793139293139294, "grad_norm": 0.5658766831235557, "learning_rate": 6.883191858026006e-07, "loss": 0.5002, "step": 5155 }, { "epoch": 2.681912681912682, "grad_norm": 0.5560529305298989, "learning_rate": 6.773356604796744e-07, "loss": 0.4975, "step": 5160 }, { "epoch": 2.6845114345114345, "grad_norm": 0.5708450386103079, "learning_rate": 6.664374010959739e-07, "loss": 0.5089, "step": 5165 }, { "epoch": 2.687110187110187, "grad_norm": 0.5562285971352838, "learning_rate": 6.556245073279777e-07, "loss": 0.5075, "step": 5170 }, { "epoch": 2.6897089397089395, "grad_norm": 0.5721605688382857, "learning_rate": 6.448970780713948e-07, "loss": 0.4876, "step": 5175 }, { "epoch": 2.6923076923076925, "grad_norm": 0.567860262795361, "learning_rate": 6.342552114402789e-07, "loss": 0.4968, "step": 5180 }, { "epoch": 2.694906444906445, "grad_norm": 0.5512124172540173, "learning_rate": 6.236990047661074e-07, "loss": 0.4971, "step": 5185 }, { "epoch": 2.6975051975051976, "grad_norm": 0.5567955011645962, "learning_rate": 6.132285545969141e-07, "loss": 0.4893, "step": 5190 }, { "epoch": 2.70010395010395, "grad_norm": 0.5658501671925406, "learning_rate": 6.028439566963929e-07, "loss": 0.4899, "step": 5195 }, { "epoch": 2.7027027027027026, "grad_norm": 0.5444634054315433, "learning_rate": 5.925453060430219e-07, "loss": 0.4878, "step": 5200 }, { "epoch": 2.705301455301455, "grad_norm": 0.550663548661878, "learning_rate": 5.823326968292009e-07, "loss": 0.5009, "step": 5205 }, { "epoch": 2.7079002079002077, "grad_norm": 0.6060107034007801, "learning_rate": 5.722062224603886e-07, "loss": 0.4946, "step": 5210 }, { "epoch": 2.7104989604989607, "grad_norm": 0.58216821945967, "learning_rate": 5.621659755542408e-07, "loss": 0.5057, "step": 5215 }, { "epoch": 2.713097713097713, "grad_norm": 0.5416674185051638, "learning_rate": 5.522120479397731e-07, "loss": 0.4965, "step": 5220 }, { "epoch": 2.7156964656964657, "grad_norm": 0.5761995130950316, "learning_rate": 5.423445306565168e-07, "loss": 0.5038, "step": 5225 }, { "epoch": 2.7182952182952183, "grad_norm": 0.5635042371421582, "learning_rate": 5.325635139536867e-07, "loss": 0.4884, "step": 5230 }, { "epoch": 2.720893970893971, "grad_norm": 0.5743033588993577, "learning_rate": 5.228690872893527e-07, "loss": 0.4934, "step": 5235 }, { "epoch": 2.7234927234927238, "grad_norm": 0.5431291593888027, "learning_rate": 5.132613393296293e-07, "loss": 0.4921, "step": 5240 }, { "epoch": 2.726091476091476, "grad_norm": 0.5702390465003064, "learning_rate": 5.037403579478551e-07, "loss": 0.5067, "step": 5245 }, { "epoch": 2.728690228690229, "grad_norm": 0.5864949506182338, "learning_rate": 4.943062302237922e-07, "loss": 0.5047, "step": 5250 }, { "epoch": 2.7312889812889813, "grad_norm": 0.5856655357457804, "learning_rate": 4.849590424428386e-07, "loss": 0.498, "step": 5255 }, { "epoch": 2.733887733887734, "grad_norm": 0.5592585991123705, "learning_rate": 4.7569888009522336e-07, "loss": 0.5062, "step": 5260 }, { "epoch": 2.7364864864864864, "grad_norm": 0.5598975614142522, "learning_rate": 4.665258278752383e-07, "loss": 0.4922, "step": 5265 }, { "epoch": 2.739085239085239, "grad_norm": 0.5672657648159654, "learning_rate": 4.574399696804588e-07, "loss": 0.5032, "step": 5270 }, { "epoch": 2.741683991683992, "grad_norm": 0.5635739403935113, "learning_rate": 4.4844138861096954e-07, "loss": 0.4914, "step": 5275 }, { "epoch": 2.7442827442827444, "grad_norm": 0.5707341919153839, "learning_rate": 4.3953016696861805e-07, "loss": 0.4955, "step": 5280 }, { "epoch": 2.746881496881497, "grad_norm": 0.593682622033041, "learning_rate": 4.3070638625624884e-07, "loss": 0.504, "step": 5285 }, { "epoch": 2.7494802494802495, "grad_norm": 0.5308025960734446, "learning_rate": 4.2197012717696604e-07, "loss": 0.4898, "step": 5290 }, { "epoch": 2.752079002079002, "grad_norm": 0.562474477928597, "learning_rate": 4.133214696333943e-07, "loss": 0.4919, "step": 5295 }, { "epoch": 2.7546777546777546, "grad_norm": 0.5626677048136434, "learning_rate": 4.047604927269433e-07, "loss": 0.5041, "step": 5300 }, { "epoch": 2.757276507276507, "grad_norm": 0.5534559012860586, "learning_rate": 3.9628727475709003e-07, "loss": 0.5018, "step": 5305 }, { "epoch": 2.75987525987526, "grad_norm": 0.5678002530448841, "learning_rate": 3.879018932206624e-07, "loss": 0.4795, "step": 5310 }, { "epoch": 2.7624740124740126, "grad_norm": 0.5725188711773384, "learning_rate": 3.796044248111219e-07, "loss": 0.4825, "step": 5315 }, { "epoch": 2.765072765072765, "grad_norm": 0.5449981795766418, "learning_rate": 3.7139494541787225e-07, "loss": 0.4966, "step": 5320 }, { "epoch": 2.7676715176715176, "grad_norm": 0.5793024671746052, "learning_rate": 3.632735301255652e-07, "loss": 0.499, "step": 5325 }, { "epoch": 2.77027027027027, "grad_norm": 0.5463699816730897, "learning_rate": 3.552402532134014e-07, "loss": 0.4971, "step": 5330 }, { "epoch": 2.7728690228690227, "grad_norm": 0.569435124360503, "learning_rate": 3.472951881544695e-07, "loss": 0.4965, "step": 5335 }, { "epoch": 2.7754677754677752, "grad_norm": 0.5426897702433433, "learning_rate": 3.3943840761505695e-07, "loss": 0.5109, "step": 5340 }, { "epoch": 2.778066528066528, "grad_norm": 0.5583873172184759, "learning_rate": 3.316699834539983e-07, "loss": 0.5025, "step": 5345 }, { "epoch": 2.7806652806652807, "grad_norm": 0.589354529655944, "learning_rate": 3.239899867220064e-07, "loss": 0.4998, "step": 5350 }, { "epoch": 2.7832640332640333, "grad_norm": 0.5254097934455335, "learning_rate": 3.163984876610371e-07, "loss": 0.4949, "step": 5355 }, { "epoch": 2.785862785862786, "grad_norm": 0.5536550483370661, "learning_rate": 3.0889555570363216e-07, "loss": 0.4917, "step": 5360 }, { "epoch": 2.7884615384615383, "grad_norm": 0.5994721180940217, "learning_rate": 3.0148125947229047e-07, "loss": 0.495, "step": 5365 }, { "epoch": 2.7910602910602913, "grad_norm": 0.5652212087251041, "learning_rate": 2.9415566677884365e-07, "loss": 0.5029, "step": 5370 }, { "epoch": 2.7936590436590434, "grad_norm": 0.567605739930232, "learning_rate": 2.869188446238336e-07, "loss": 0.506, "step": 5375 }, { "epoch": 2.7962577962577964, "grad_norm": 0.5569576384780233, "learning_rate": 2.7977085919589253e-07, "loss": 0.5003, "step": 5380 }, { "epoch": 2.798856548856549, "grad_norm": 0.5412543330665912, "learning_rate": 2.727117758711506e-07, "loss": 0.4887, "step": 5385 }, { "epoch": 2.8014553014553014, "grad_norm": 0.5376966982466084, "learning_rate": 2.6574165921262605e-07, "loss": 0.4888, "step": 5390 }, { "epoch": 2.804054054054054, "grad_norm": 0.54053951299071, "learning_rate": 2.588605729696447e-07, "loss": 0.4919, "step": 5395 }, { "epoch": 2.8066528066528065, "grad_norm": 0.5981753062988322, "learning_rate": 2.5206858007724934e-07, "loss": 0.4839, "step": 5400 }, { "epoch": 2.8092515592515594, "grad_norm": 0.5725431316908658, "learning_rate": 2.453657426556244e-07, "loss": 0.5122, "step": 5405 }, { "epoch": 2.811850311850312, "grad_norm": 0.5422874879244404, "learning_rate": 2.387521220095357e-07, "loss": 0.4891, "step": 5410 }, { "epoch": 2.8144490644490645, "grad_norm": 0.5599975123926269, "learning_rate": 2.3222777862776046e-07, "loss": 0.5021, "step": 5415 }, { "epoch": 2.817047817047817, "grad_norm": 0.5590054648939673, "learning_rate": 2.2579277218253926e-07, "loss": 0.4841, "step": 5420 }, { "epoch": 2.8196465696465696, "grad_norm": 0.5504364503745305, "learning_rate": 2.1944716152902834e-07, "loss": 0.5002, "step": 5425 }, { "epoch": 2.822245322245322, "grad_norm": 0.5797812317339487, "learning_rate": 2.131910047047625e-07, "loss": 0.486, "step": 5430 }, { "epoch": 2.8248440748440746, "grad_norm": 0.5584561563327431, "learning_rate": 2.070243589291221e-07, "loss": 0.4879, "step": 5435 }, { "epoch": 2.8274428274428276, "grad_norm": 0.5983591006728118, "learning_rate": 2.0094728060281454e-07, "loss": 0.4964, "step": 5440 }, { "epoch": 2.83004158004158, "grad_norm": 0.5793345159708853, "learning_rate": 1.9495982530735035e-07, "loss": 0.4931, "step": 5445 }, { "epoch": 2.8326403326403327, "grad_norm": 0.5521618537806441, "learning_rate": 1.890620478045435e-07, "loss": 0.4844, "step": 5450 }, { "epoch": 2.835239085239085, "grad_norm": 0.5590622081741721, "learning_rate": 1.832540020360063e-07, "loss": 0.4941, "step": 5455 }, { "epoch": 2.8378378378378377, "grad_norm": 0.5807487335701172, "learning_rate": 1.7753574112265526e-07, "loss": 0.4888, "step": 5460 }, { "epoch": 2.8404365904365907, "grad_norm": 0.5896137562543345, "learning_rate": 1.7190731736422606e-07, "loss": 0.4983, "step": 5465 }, { "epoch": 2.8430353430353428, "grad_norm": 0.5740425049161126, "learning_rate": 1.6636878223879826e-07, "loss": 0.4931, "step": 5470 }, { "epoch": 2.8456340956340958, "grad_norm": 0.5440884316639071, "learning_rate": 1.6092018640231688e-07, "loss": 0.4831, "step": 5475 }, { "epoch": 2.8482328482328483, "grad_norm": 0.5781760225759857, "learning_rate": 1.5556157968813823e-07, "loss": 0.4988, "step": 5480 }, { "epoch": 2.850831600831601, "grad_norm": 0.5889448479356277, "learning_rate": 1.5029301110656923e-07, "loss": 0.4885, "step": 5485 }, { "epoch": 2.8534303534303533, "grad_norm": 0.6111223279448279, "learning_rate": 1.4511452884441778e-07, "loss": 0.5014, "step": 5490 }, { "epoch": 2.856029106029106, "grad_norm": 0.5497820384951762, "learning_rate": 1.400261802645575e-07, "loss": 0.4951, "step": 5495 }, { "epoch": 2.858627858627859, "grad_norm": 0.560040257401693, "learning_rate": 1.350280119054881e-07, "loss": 0.4907, "step": 5500 }, { "epoch": 2.8612266112266114, "grad_norm": 0.5832689907805786, "learning_rate": 1.3012006948091237e-07, "loss": 0.4919, "step": 5505 }, { "epoch": 2.863825363825364, "grad_norm": 0.5550041536720619, "learning_rate": 1.2530239787932108e-07, "loss": 0.4841, "step": 5510 }, { "epoch": 2.8664241164241164, "grad_norm": 0.5548285935264545, "learning_rate": 1.2057504116357865e-07, "loss": 0.4957, "step": 5515 }, { "epoch": 2.869022869022869, "grad_norm": 0.5573963669652322, "learning_rate": 1.1593804257052143e-07, "loss": 0.5003, "step": 5520 }, { "epoch": 2.8716216216216215, "grad_norm": 0.573683837072622, "learning_rate": 1.1139144451056016e-07, "loss": 0.4917, "step": 5525 }, { "epoch": 2.874220374220374, "grad_norm": 0.5955060126211607, "learning_rate": 1.0693528856729918e-07, "loss": 0.5077, "step": 5530 }, { "epoch": 2.876819126819127, "grad_norm": 0.5823077256056483, "learning_rate": 1.025696154971445e-07, "loss": 0.4879, "step": 5535 }, { "epoch": 2.8794178794178795, "grad_norm": 0.578110542140886, "learning_rate": 9.829446522894193e-08, "loss": 0.5007, "step": 5540 }, { "epoch": 2.882016632016632, "grad_norm": 0.5745290388769638, "learning_rate": 9.410987686360618e-08, "loss": 0.4846, "step": 5545 }, { "epoch": 2.8846153846153846, "grad_norm": 0.5645029409864777, "learning_rate": 9.001588867376343e-08, "loss": 0.4875, "step": 5550 }, { "epoch": 2.887214137214137, "grad_norm": 0.5579791649018835, "learning_rate": 8.601253810340493e-08, "loss": 0.498, "step": 5555 }, { "epoch": 2.88981288981289, "grad_norm": 0.5352826063441829, "learning_rate": 8.209986176753947e-08, "loss": 0.4929, "step": 5560 }, { "epoch": 2.892411642411642, "grad_norm": 0.5406028586508593, "learning_rate": 7.827789545186149e-08, "loss": 0.493, "step": 5565 }, { "epoch": 2.895010395010395, "grad_norm": 0.5735179042030664, "learning_rate": 7.454667411242677e-08, "loss": 0.4974, "step": 5570 }, { "epoch": 2.8976091476091477, "grad_norm": 0.5884667672124062, "learning_rate": 7.090623187532286e-08, "loss": 0.4979, "step": 5575 }, { "epoch": 2.9002079002079, "grad_norm": 0.5483035016900611, "learning_rate": 6.735660203636918e-08, "loss": 0.4905, "step": 5580 }, { "epoch": 2.9028066528066527, "grad_norm": 0.5528971829398494, "learning_rate": 6.389781706080289e-08, "loss": 0.5122, "step": 5585 }, { "epoch": 2.9054054054054053, "grad_norm": 0.6057996286720458, "learning_rate": 6.052990858298801e-08, "loss": 0.5028, "step": 5590 }, { "epoch": 2.9080041580041582, "grad_norm": 0.5791623231565692, "learning_rate": 5.7252907406123436e-08, "loss": 0.4982, "step": 5595 }, { "epoch": 2.9106029106029108, "grad_norm": 0.5627119775940129, "learning_rate": 5.406684350195979e-08, "loss": 0.4964, "step": 5600 }, { "epoch": 2.9132016632016633, "grad_norm": 0.5559095452523849, "learning_rate": 5.0971746010528566e-08, "loss": 0.5063, "step": 5605 }, { "epoch": 2.915800415800416, "grad_norm": 0.5535048888740743, "learning_rate": 4.7967643239875686e-08, "loss": 0.501, "step": 5610 }, { "epoch": 2.9183991683991684, "grad_norm": 0.5462617861557779, "learning_rate": 4.505456266579833e-08, "loss": 0.5031, "step": 5615 }, { "epoch": 2.920997920997921, "grad_norm": 0.5384884238791128, "learning_rate": 4.22325309315963e-08, "loss": 0.5019, "step": 5620 }, { "epoch": 2.9235966735966734, "grad_norm": 0.5773761719166159, "learning_rate": 3.950157384783104e-08, "loss": 0.4939, "step": 5625 }, { "epoch": 2.9261954261954264, "grad_norm": 0.5595211401825231, "learning_rate": 3.68617163920848e-08, "loss": 0.5007, "step": 5630 }, { "epoch": 2.928794178794179, "grad_norm": 0.5658405364960404, "learning_rate": 3.4312982708734065e-08, "loss": 0.4806, "step": 5635 }, { "epoch": 2.9313929313929314, "grad_norm": 0.5503957964422638, "learning_rate": 3.1855396108730897e-08, "loss": 0.5014, "step": 5640 }, { "epoch": 2.933991683991684, "grad_norm": 0.5453959548244628, "learning_rate": 2.9488979069387523e-08, "loss": 0.4894, "step": 5645 }, { "epoch": 2.9365904365904365, "grad_norm": 0.5693202599680172, "learning_rate": 2.721375323416875e-08, "loss": 0.4966, "step": 5650 }, { "epoch": 2.939189189189189, "grad_norm": 0.5709168022581379, "learning_rate": 2.5029739412497643e-08, "loss": 0.4887, "step": 5655 }, { "epoch": 2.9417879417879416, "grad_norm": 0.5402927981587441, "learning_rate": 2.293695757956571e-08, "loss": 0.4968, "step": 5660 }, { "epoch": 2.9443866943866945, "grad_norm": 0.629131836957027, "learning_rate": 2.0935426876144138e-08, "loss": 0.4891, "step": 5665 }, { "epoch": 2.946985446985447, "grad_norm": 0.5667833955309428, "learning_rate": 1.9025165608418382e-08, "loss": 0.4975, "step": 5670 }, { "epoch": 2.9495841995841996, "grad_norm": 0.5618681057012622, "learning_rate": 1.7206191247810533e-08, "loss": 0.4949, "step": 5675 }, { "epoch": 2.952182952182952, "grad_norm": 0.5467732554533196, "learning_rate": 1.5478520430826095e-08, "loss": 0.4985, "step": 5680 }, { "epoch": 2.9547817047817047, "grad_norm": 0.566899525327703, "learning_rate": 1.3842168958900782e-08, "loss": 0.4978, "step": 5685 }, { "epoch": 2.9573804573804576, "grad_norm": 0.548674729535616, "learning_rate": 1.229715179825397e-08, "loss": 0.5092, "step": 5690 }, { "epoch": 2.9599792099792097, "grad_norm": 0.5855706553975555, "learning_rate": 1.0843483079755468e-08, "loss": 0.5036, "step": 5695 }, { "epoch": 2.9625779625779627, "grad_norm": 0.574837155061116, "learning_rate": 9.481176098788958e-09, "loss": 0.5036, "step": 5700 }, { "epoch": 2.965176715176715, "grad_norm": 0.5658141647440329, "learning_rate": 8.210243315140976e-09, "loss": 0.4972, "step": 5705 }, { "epoch": 2.9677754677754677, "grad_norm": 0.5791629723335482, "learning_rate": 7.030696352878786e-09, "loss": 0.4942, "step": 5710 }, { "epoch": 2.9703742203742203, "grad_norm": 0.5707304439471226, "learning_rate": 5.942546000244909e-09, "loss": 0.4946, "step": 5715 }, { "epoch": 2.972972972972973, "grad_norm": 0.5873811028122294, "learning_rate": 4.945802209562755e-09, "loss": 0.4899, "step": 5720 }, { "epoch": 2.975571725571726, "grad_norm": 0.5626457291531141, "learning_rate": 4.0404740971433655e-09, "loss": 0.4837, "step": 5725 }, { "epoch": 2.9781704781704783, "grad_norm": 0.5482498449859082, "learning_rate": 3.226569943197699e-09, "loss": 0.4958, "step": 5730 }, { "epoch": 2.980769230769231, "grad_norm": 0.57584453111696, "learning_rate": 2.5040971917689172e-09, "loss": 0.5065, "step": 5735 }, { "epoch": 2.9833679833679834, "grad_norm": 0.6193525019087672, "learning_rate": 1.873062450659102e-09, "loss": 0.4947, "step": 5740 }, { "epoch": 2.985966735966736, "grad_norm": 0.5534322935945964, "learning_rate": 1.3334714913681989e-09, "loss": 0.4968, "step": 5745 }, { "epoch": 2.9885654885654884, "grad_norm": 0.5698273659048839, "learning_rate": 8.853292490462739e-10, "loss": 0.4965, "step": 5750 }, { "epoch": 2.991164241164241, "grad_norm": 0.6072011011211793, "learning_rate": 5.286398224413347e-10, "loss": 0.4982, "step": 5755 }, { "epoch": 2.993762993762994, "grad_norm": 0.5483495272455329, "learning_rate": 2.6340647386935426e-10, "loss": 0.4905, "step": 5760 }, { "epoch": 2.9963617463617465, "grad_norm": 0.5468214431100177, "learning_rate": 8.963162917763335e-11, "loss": 0.4943, "step": 5765 }, { "epoch": 2.998960498960499, "grad_norm": 0.554348853048081, "learning_rate": 7.31687772592693e-12, "loss": 0.4984, "step": 5770 }, { "epoch": 3.0, "eval_loss": 0.8089174032211304, "eval_runtime": 106.5642, "eval_samples_per_second": 77.043, "eval_steps_per_second": 1.211, "step": 5772 }, { "epoch": 3.0, "step": 5772, "total_flos": 1208539372584960.0, "train_loss": 0.6196737293559317, "train_runtime": 16925.3973, "train_samples_per_second": 21.825, "train_steps_per_second": 0.341 } ], "logging_steps": 5, "max_steps": 5772, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1208539372584960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }