diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13680 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998846198223146, + "eval_steps": 500, + "global_step": 9750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015384023691396484, + "grad_norm": 0.797955334186554, + "learning_rate": 4.999996755554811e-05, + "loss": 1.901, + "step": 5 + }, + { + "epoch": 0.003076804738279297, + "grad_norm": 1.7458454370498657, + "learning_rate": 4.999987022227664e-05, + "loss": 1.7519, + "step": 10 + }, + { + "epoch": 0.004615207107418946, + "grad_norm": 1.4588245153427124, + "learning_rate": 4.999970800043822e-05, + "loss": 1.8846, + "step": 15 + }, + { + "epoch": 0.006153609476558594, + "grad_norm": 1.4333337545394897, + "learning_rate": 4.9999480890453916e-05, + "loss": 1.7849, + "step": 20 + }, + { + "epoch": 0.007692011845698243, + "grad_norm": 1.8351398706436157, + "learning_rate": 4.99991888929132e-05, + "loss": 1.5929, + "step": 25 + }, + { + "epoch": 0.009230414214837892, + "grad_norm": 1.145352840423584, + "learning_rate": 4.999883200857397e-05, + "loss": 1.4525, + "step": 30 + }, + { + "epoch": 0.010768816583977539, + "grad_norm": 2.642392873764038, + "learning_rate": 4.999841023836254e-05, + "loss": 1.676, + "step": 35 + }, + { + "epoch": 0.012307218953117188, + "grad_norm": 1.1834251880645752, + "learning_rate": 4.999792358337363e-05, + "loss": 1.4053, + "step": 40 + }, + { + "epoch": 0.013845621322256836, + "grad_norm": 1.87557053565979, + "learning_rate": 4.999737204487039e-05, + "loss": 1.395, + "step": 45 + }, + { + "epoch": 0.015384023691396485, + "grad_norm": 1.6970587968826294, + "learning_rate": 4.999675562428437e-05, + "loss": 1.5014, + "step": 50 + }, + { + "epoch": 0.016922426060536132, + "grad_norm": 1.1248029470443726, + "learning_rate": 4.999607432321551e-05, + "loss": 1.2834, + "step": 55 + }, + { + "epoch": 0.018460828429675783, + "grad_norm": 5.078829765319824, + "learning_rate": 4.999532814343219e-05, + "loss": 1.4268, + "step": 60 + }, + { + "epoch": 0.01999923079881543, + "grad_norm": 1.0455766916275024, + "learning_rate": 4.999451708687114e-05, + "loss": 1.33, + "step": 65 + }, + { + "epoch": 0.021537633167955077, + "grad_norm": 1.837597370147705, + "learning_rate": 4.999364115563751e-05, + "loss": 1.3419, + "step": 70 + }, + { + "epoch": 0.023076035537094728, + "grad_norm": 1.0537432432174683, + "learning_rate": 4.999270035200483e-05, + "loss": 1.3378, + "step": 75 + }, + { + "epoch": 0.024614437906234375, + "grad_norm": 2.875886917114258, + "learning_rate": 4.9991694678415e-05, + "loss": 1.263, + "step": 80 + }, + { + "epoch": 0.026152840275374026, + "grad_norm": 0.9554703831672668, + "learning_rate": 4.9990624137478314e-05, + "loss": 1.2777, + "step": 85 + }, + { + "epoch": 0.027691242644513673, + "grad_norm": 1.6160002946853638, + "learning_rate": 4.998948873197342e-05, + "loss": 1.3095, + "step": 90 + }, + { + "epoch": 0.02922964501365332, + "grad_norm": 0.982302188873291, + "learning_rate": 4.998828846484732e-05, + "loss": 1.3788, + "step": 95 + }, + { + "epoch": 0.03076804738279297, + "grad_norm": 1.1846606731414795, + "learning_rate": 4.9987023339215374e-05, + "loss": 1.3818, + "step": 100 + }, + { + "epoch": 0.03230644975193262, + "grad_norm": 1.7230656147003174, + "learning_rate": 4.9985693358361296e-05, + "loss": 1.2638, + "step": 105 + }, + { + "epoch": 0.033844852121072265, + "grad_norm": 1.3569287061691284, + "learning_rate": 4.998429852573712e-05, + "loss": 1.1706, + "step": 110 + }, + { + "epoch": 0.035383254490211916, + "grad_norm": 1.555022120475769, + "learning_rate": 4.998283884496321e-05, + "loss": 1.287, + "step": 115 + }, + { + "epoch": 0.036921656859351566, + "grad_norm": 1.9749863147735596, + "learning_rate": 4.998131431982826e-05, + "loss": 1.1886, + "step": 120 + }, + { + "epoch": 0.03846005922849121, + "grad_norm": 0.8346091508865356, + "learning_rate": 4.9979724954289244e-05, + "loss": 1.2522, + "step": 125 + }, + { + "epoch": 0.03999846159763086, + "grad_norm": 4.200445175170898, + "learning_rate": 4.997807075247146e-05, + "loss": 1.2879, + "step": 130 + }, + { + "epoch": 0.04153686396677051, + "grad_norm": 1.5512927770614624, + "learning_rate": 4.9976351718668476e-05, + "loss": 1.3098, + "step": 135 + }, + { + "epoch": 0.043075266335910155, + "grad_norm": 1.1664855480194092, + "learning_rate": 4.9974567857342155e-05, + "loss": 1.2395, + "step": 140 + }, + { + "epoch": 0.044613668705049805, + "grad_norm": 1.1823747158050537, + "learning_rate": 4.997271917312259e-05, + "loss": 1.2825, + "step": 145 + }, + { + "epoch": 0.046152071074189456, + "grad_norm": 1.0937031507492065, + "learning_rate": 4.997080567080817e-05, + "loss": 1.1966, + "step": 150 + }, + { + "epoch": 0.0476904734433291, + "grad_norm": 0.899167001247406, + "learning_rate": 4.9968827355365465e-05, + "loss": 1.2136, + "step": 155 + }, + { + "epoch": 0.04922887581246875, + "grad_norm": 1.9292325973510742, + "learning_rate": 4.996678423192933e-05, + "loss": 1.2735, + "step": 160 + }, + { + "epoch": 0.0507672781816084, + "grad_norm": 1.0430412292480469, + "learning_rate": 4.9964676305802794e-05, + "loss": 1.3386, + "step": 165 + }, + { + "epoch": 0.05230568055074805, + "grad_norm": 1.879791498184204, + "learning_rate": 4.99625035824571e-05, + "loss": 1.2693, + "step": 170 + }, + { + "epoch": 0.053844082919887695, + "grad_norm": 1.6582942008972168, + "learning_rate": 4.996026606753167e-05, + "loss": 1.1652, + "step": 175 + }, + { + "epoch": 0.055382485289027346, + "grad_norm": 3.5379745960235596, + "learning_rate": 4.99579637668341e-05, + "loss": 1.3028, + "step": 180 + }, + { + "epoch": 0.056920887658166996, + "grad_norm": 3.7615771293640137, + "learning_rate": 4.9955596686340154e-05, + "loss": 1.1635, + "step": 185 + }, + { + "epoch": 0.05845929002730664, + "grad_norm": 0.928205668926239, + "learning_rate": 4.995316483219372e-05, + "loss": 1.4032, + "step": 190 + }, + { + "epoch": 0.05999769239644629, + "grad_norm": 1.0329424142837524, + "learning_rate": 4.995066821070679e-05, + "loss": 1.2417, + "step": 195 + }, + { + "epoch": 0.06153609476558594, + "grad_norm": 1.8608773946762085, + "learning_rate": 4.994810682835951e-05, + "loss": 1.2683, + "step": 200 + }, + { + "epoch": 0.06307449713472559, + "grad_norm": 2.3579938411712646, + "learning_rate": 4.9945480691800075e-05, + "loss": 1.252, + "step": 205 + }, + { + "epoch": 0.06461289950386524, + "grad_norm": 0.7121773958206177, + "learning_rate": 4.994278980784478e-05, + "loss": 1.2303, + "step": 210 + }, + { + "epoch": 0.06615130187300489, + "grad_norm": 0.783092200756073, + "learning_rate": 4.9940034183477954e-05, + "loss": 1.3052, + "step": 215 + }, + { + "epoch": 0.06768970424214453, + "grad_norm": 0.9442629814147949, + "learning_rate": 4.993721382585199e-05, + "loss": 1.3012, + "step": 220 + }, + { + "epoch": 0.06922810661128419, + "grad_norm": 1.0986599922180176, + "learning_rate": 4.9934328742287285e-05, + "loss": 1.1937, + "step": 225 + }, + { + "epoch": 0.07076650898042383, + "grad_norm": 1.1720629930496216, + "learning_rate": 4.9931378940272214e-05, + "loss": 1.2892, + "step": 230 + }, + { + "epoch": 0.07230491134956347, + "grad_norm": 1.6844689846038818, + "learning_rate": 4.992836442746317e-05, + "loss": 1.1496, + "step": 235 + }, + { + "epoch": 0.07384331371870313, + "grad_norm": 1.4238617420196533, + "learning_rate": 4.992528521168449e-05, + "loss": 1.2769, + "step": 240 + }, + { + "epoch": 0.07538171608784278, + "grad_norm": 1.1840909719467163, + "learning_rate": 4.992214130092845e-05, + "loss": 1.2355, + "step": 245 + }, + { + "epoch": 0.07692011845698242, + "grad_norm": 1.3611423969268799, + "learning_rate": 4.9918932703355256e-05, + "loss": 1.3105, + "step": 250 + }, + { + "epoch": 0.07845852082612208, + "grad_norm": 1.2186192274093628, + "learning_rate": 4.991565942729298e-05, + "loss": 1.3476, + "step": 255 + }, + { + "epoch": 0.07999692319526172, + "grad_norm": 1.8337148427963257, + "learning_rate": 4.991232148123761e-05, + "loss": 1.1474, + "step": 260 + }, + { + "epoch": 0.08153532556440136, + "grad_norm": 1.6369004249572754, + "learning_rate": 4.990891887385297e-05, + "loss": 1.1932, + "step": 265 + }, + { + "epoch": 0.08307372793354102, + "grad_norm": 1.483383059501648, + "learning_rate": 4.9905451613970725e-05, + "loss": 1.2814, + "step": 270 + }, + { + "epoch": 0.08461213030268067, + "grad_norm": 0.9386169910430908, + "learning_rate": 4.990191971059033e-05, + "loss": 1.337, + "step": 275 + }, + { + "epoch": 0.08615053267182031, + "grad_norm": 0.8377572894096375, + "learning_rate": 4.989832317287904e-05, + "loss": 1.1444, + "step": 280 + }, + { + "epoch": 0.08768893504095997, + "grad_norm": 2.5681023597717285, + "learning_rate": 4.9894662010171874e-05, + "loss": 1.3671, + "step": 285 + }, + { + "epoch": 0.08922733741009961, + "grad_norm": 0.8709166049957275, + "learning_rate": 4.98909362319716e-05, + "loss": 1.1663, + "step": 290 + }, + { + "epoch": 0.09076573977923925, + "grad_norm": 0.9449151158332825, + "learning_rate": 4.988714584794866e-05, + "loss": 1.2205, + "step": 295 + }, + { + "epoch": 0.09230414214837891, + "grad_norm": 1.1699813604354858, + "learning_rate": 4.988329086794122e-05, + "loss": 1.3335, + "step": 300 + }, + { + "epoch": 0.09384254451751856, + "grad_norm": 1.5694646835327148, + "learning_rate": 4.98793713019551e-05, + "loss": 1.3269, + "step": 305 + }, + { + "epoch": 0.0953809468866582, + "grad_norm": 1.073695421218872, + "learning_rate": 4.9875387160163744e-05, + "loss": 1.2983, + "step": 310 + }, + { + "epoch": 0.09691934925579786, + "grad_norm": 0.9977647066116333, + "learning_rate": 4.987133845290822e-05, + "loss": 1.2401, + "step": 315 + }, + { + "epoch": 0.0984577516249375, + "grad_norm": 1.1257829666137695, + "learning_rate": 4.986722519069719e-05, + "loss": 1.1723, + "step": 320 + }, + { + "epoch": 0.09999615399407714, + "grad_norm": 2.4303648471832275, + "learning_rate": 4.9863047384206835e-05, + "loss": 1.2318, + "step": 325 + }, + { + "epoch": 0.1015345563632168, + "grad_norm": 2.8859667778015137, + "learning_rate": 4.9858805044280895e-05, + "loss": 1.2859, + "step": 330 + }, + { + "epoch": 0.10307295873235645, + "grad_norm": 1.5526022911071777, + "learning_rate": 4.985449818193061e-05, + "loss": 1.26, + "step": 335 + }, + { + "epoch": 0.1046113611014961, + "grad_norm": 1.387646198272705, + "learning_rate": 4.9850126808334665e-05, + "loss": 1.1915, + "step": 340 + }, + { + "epoch": 0.10614976347063575, + "grad_norm": 1.4605071544647217, + "learning_rate": 4.984569093483922e-05, + "loss": 1.2653, + "step": 345 + }, + { + "epoch": 0.10768816583977539, + "grad_norm": 0.9224326610565186, + "learning_rate": 4.984119057295783e-05, + "loss": 1.1685, + "step": 350 + }, + { + "epoch": 0.10922656820891505, + "grad_norm": 0.7252030372619629, + "learning_rate": 4.983662573437143e-05, + "loss": 1.1732, + "step": 355 + }, + { + "epoch": 0.11076497057805469, + "grad_norm": 1.599922776222229, + "learning_rate": 4.9831996430928326e-05, + "loss": 1.144, + "step": 360 + }, + { + "epoch": 0.11230337294719434, + "grad_norm": 1.3098397254943848, + "learning_rate": 4.9827302674644126e-05, + "loss": 1.1434, + "step": 365 + }, + { + "epoch": 0.11384177531633399, + "grad_norm": 1.1649527549743652, + "learning_rate": 4.982254447770175e-05, + "loss": 1.2149, + "step": 370 + }, + { + "epoch": 0.11538017768547364, + "grad_norm": 1.794103741645813, + "learning_rate": 4.981772185245135e-05, + "loss": 1.1748, + "step": 375 + }, + { + "epoch": 0.11691858005461328, + "grad_norm": 0.9207557439804077, + "learning_rate": 4.981283481141034e-05, + "loss": 1.2767, + "step": 380 + }, + { + "epoch": 0.11845698242375294, + "grad_norm": 1.2299586534500122, + "learning_rate": 4.980788336726328e-05, + "loss": 1.2229, + "step": 385 + }, + { + "epoch": 0.11999538479289258, + "grad_norm": 1.6368671655654907, + "learning_rate": 4.980286753286195e-05, + "loss": 1.1892, + "step": 390 + }, + { + "epoch": 0.12153378716203223, + "grad_norm": 0.9977037906646729, + "learning_rate": 4.9797787321225215e-05, + "loss": 1.2199, + "step": 395 + }, + { + "epoch": 0.12307218953117188, + "grad_norm": 1.6430251598358154, + "learning_rate": 4.979264274553905e-05, + "loss": 1.167, + "step": 400 + }, + { + "epoch": 0.12461059190031153, + "grad_norm": 0.9530766606330872, + "learning_rate": 4.97874338191565e-05, + "loss": 1.1379, + "step": 405 + }, + { + "epoch": 0.12614899426945117, + "grad_norm": 1.4140937328338623, + "learning_rate": 4.978216055559761e-05, + "loss": 1.2004, + "step": 410 + }, + { + "epoch": 0.12768739663859083, + "grad_norm": 2.96806001663208, + "learning_rate": 4.9776822968549454e-05, + "loss": 1.1749, + "step": 415 + }, + { + "epoch": 0.12922579900773049, + "grad_norm": 1.0619330406188965, + "learning_rate": 4.977142107186602e-05, + "loss": 1.1946, + "step": 420 + }, + { + "epoch": 0.13076420137687012, + "grad_norm": 3.656414031982422, + "learning_rate": 4.976595487956823e-05, + "loss": 1.1884, + "step": 425 + }, + { + "epoch": 0.13230260374600977, + "grad_norm": 2.3072898387908936, + "learning_rate": 4.976042440584392e-05, + "loss": 1.1723, + "step": 430 + }, + { + "epoch": 0.13384100611514943, + "grad_norm": 1.3326804637908936, + "learning_rate": 4.975482966504772e-05, + "loss": 1.2658, + "step": 435 + }, + { + "epoch": 0.13537940848428906, + "grad_norm": 1.7775696516036987, + "learning_rate": 4.97491706717011e-05, + "loss": 1.2043, + "step": 440 + }, + { + "epoch": 0.13691781085342872, + "grad_norm": 2.0168044567108154, + "learning_rate": 4.97434474404923e-05, + "loss": 1.2228, + "step": 445 + }, + { + "epoch": 0.13845621322256838, + "grad_norm": 1.757367730140686, + "learning_rate": 4.973765998627628e-05, + "loss": 1.0853, + "step": 450 + }, + { + "epoch": 0.139994615591708, + "grad_norm": 1.5851964950561523, + "learning_rate": 4.9731808324074717e-05, + "loss": 1.1649, + "step": 455 + }, + { + "epoch": 0.14153301796084766, + "grad_norm": 0.7482730746269226, + "learning_rate": 4.9725892469075905e-05, + "loss": 1.1394, + "step": 460 + }, + { + "epoch": 0.14307142032998732, + "grad_norm": 1.041838526725769, + "learning_rate": 4.9719912436634796e-05, + "loss": 1.183, + "step": 465 + }, + { + "epoch": 0.14460982269912695, + "grad_norm": 1.5557304620742798, + "learning_rate": 4.97138682422729e-05, + "loss": 1.0956, + "step": 470 + }, + { + "epoch": 0.1461482250682666, + "grad_norm": 2.4157028198242188, + "learning_rate": 4.970775990167826e-05, + "loss": 1.2898, + "step": 475 + }, + { + "epoch": 0.14768662743740626, + "grad_norm": 1.1285796165466309, + "learning_rate": 4.9701587430705415e-05, + "loss": 1.2317, + "step": 480 + }, + { + "epoch": 0.1492250298065459, + "grad_norm": 1.0138782262802124, + "learning_rate": 4.969535084537536e-05, + "loss": 1.2337, + "step": 485 + }, + { + "epoch": 0.15076343217568555, + "grad_norm": 2.229642629623413, + "learning_rate": 4.9689050161875506e-05, + "loss": 1.196, + "step": 490 + }, + { + "epoch": 0.1523018345448252, + "grad_norm": 1.7063117027282715, + "learning_rate": 4.9682685396559627e-05, + "loss": 1.1881, + "step": 495 + }, + { + "epoch": 0.15384023691396484, + "grad_norm": 1.2955132722854614, + "learning_rate": 4.967625656594782e-05, + "loss": 1.1329, + "step": 500 + }, + { + "epoch": 0.1553786392831045, + "grad_norm": 1.562524437904358, + "learning_rate": 4.96697636867265e-05, + "loss": 1.1875, + "step": 505 + }, + { + "epoch": 0.15691704165224415, + "grad_norm": 1.5448559522628784, + "learning_rate": 4.966320677574827e-05, + "loss": 1.2321, + "step": 510 + }, + { + "epoch": 0.15845544402138378, + "grad_norm": 1.5567346811294556, + "learning_rate": 4.9656585850031987e-05, + "loss": 1.3752, + "step": 515 + }, + { + "epoch": 0.15999384639052344, + "grad_norm": 2.398003339767456, + "learning_rate": 4.964990092676263e-05, + "loss": 1.308, + "step": 520 + }, + { + "epoch": 0.1615322487596631, + "grad_norm": 1.0853981971740723, + "learning_rate": 4.964315202329127e-05, + "loss": 1.1672, + "step": 525 + }, + { + "epoch": 0.16307065112880273, + "grad_norm": 1.2827069759368896, + "learning_rate": 4.963633915713509e-05, + "loss": 1.1813, + "step": 530 + }, + { + "epoch": 0.1646090534979424, + "grad_norm": 1.0297755002975464, + "learning_rate": 4.962946234597726e-05, + "loss": 1.224, + "step": 535 + }, + { + "epoch": 0.16614745586708204, + "grad_norm": 1.2967716455459595, + "learning_rate": 4.962252160766693e-05, + "loss": 1.216, + "step": 540 + }, + { + "epoch": 0.16768585823622167, + "grad_norm": 1.1088382005691528, + "learning_rate": 4.961551696021918e-05, + "loss": 1.2665, + "step": 545 + }, + { + "epoch": 0.16922426060536133, + "grad_norm": 1.604500412940979, + "learning_rate": 4.960844842181494e-05, + "loss": 1.2242, + "step": 550 + }, + { + "epoch": 0.170762662974501, + "grad_norm": 0.8669219017028809, + "learning_rate": 4.960131601080104e-05, + "loss": 1.0902, + "step": 555 + }, + { + "epoch": 0.17230106534364062, + "grad_norm": 2.331838369369507, + "learning_rate": 4.9594119745690014e-05, + "loss": 1.2584, + "step": 560 + }, + { + "epoch": 0.17383946771278028, + "grad_norm": 1.547214388847351, + "learning_rate": 4.95868596451602e-05, + "loss": 1.2759, + "step": 565 + }, + { + "epoch": 0.17537787008191993, + "grad_norm": 0.90031498670578, + "learning_rate": 4.957953572805558e-05, + "loss": 1.1871, + "step": 570 + }, + { + "epoch": 0.17691627245105956, + "grad_norm": 1.2103489637374878, + "learning_rate": 4.957214801338581e-05, + "loss": 1.2548, + "step": 575 + }, + { + "epoch": 0.17845467482019922, + "grad_norm": 1.005483865737915, + "learning_rate": 4.956469652032609e-05, + "loss": 1.1955, + "step": 580 + }, + { + "epoch": 0.17999307718933888, + "grad_norm": 0.9381604790687561, + "learning_rate": 4.9557181268217227e-05, + "loss": 1.1725, + "step": 585 + }, + { + "epoch": 0.1815314795584785, + "grad_norm": 1.0934962034225464, + "learning_rate": 4.9549602276565435e-05, + "loss": 1.1608, + "step": 590 + }, + { + "epoch": 0.18306988192761817, + "grad_norm": 1.2921926975250244, + "learning_rate": 4.954195956504245e-05, + "loss": 1.2638, + "step": 595 + }, + { + "epoch": 0.18460828429675782, + "grad_norm": 1.0068799257278442, + "learning_rate": 4.953425315348534e-05, + "loss": 1.1962, + "step": 600 + }, + { + "epoch": 0.18614668666589745, + "grad_norm": 1.205253005027771, + "learning_rate": 4.9526483061896534e-05, + "loss": 1.2266, + "step": 605 + }, + { + "epoch": 0.1876850890350371, + "grad_norm": 2.4699807167053223, + "learning_rate": 4.951864931044374e-05, + "loss": 1.2421, + "step": 610 + }, + { + "epoch": 0.18922349140417677, + "grad_norm": 1.494227409362793, + "learning_rate": 4.9510751919459895e-05, + "loss": 1.0911, + "step": 615 + }, + { + "epoch": 0.1907618937733164, + "grad_norm": 1.6315932273864746, + "learning_rate": 4.950279090944313e-05, + "loss": 1.2937, + "step": 620 + }, + { + "epoch": 0.19230029614245606, + "grad_norm": 1.1606189012527466, + "learning_rate": 4.949476630105669e-05, + "loss": 1.2341, + "step": 625 + }, + { + "epoch": 0.19383869851159571, + "grad_norm": 0.7757031321525574, + "learning_rate": 4.94866781151289e-05, + "loss": 1.1867, + "step": 630 + }, + { + "epoch": 0.19537710088073534, + "grad_norm": 1.1521573066711426, + "learning_rate": 4.9478526372653096e-05, + "loss": 1.2039, + "step": 635 + }, + { + "epoch": 0.196915503249875, + "grad_norm": 0.8454951643943787, + "learning_rate": 4.947031109478758e-05, + "loss": 1.1598, + "step": 640 + }, + { + "epoch": 0.19845390561901466, + "grad_norm": 1.3610525131225586, + "learning_rate": 4.9462032302855576e-05, + "loss": 1.2351, + "step": 645 + }, + { + "epoch": 0.1999923079881543, + "grad_norm": 1.006761908531189, + "learning_rate": 4.9453690018345144e-05, + "loss": 1.3258, + "step": 650 + }, + { + "epoch": 0.20153071035729395, + "grad_norm": 1.7177060842514038, + "learning_rate": 4.9445284262909156e-05, + "loss": 1.2972, + "step": 655 + }, + { + "epoch": 0.2030691127264336, + "grad_norm": 1.063753604888916, + "learning_rate": 4.943681505836523e-05, + "loss": 1.1798, + "step": 660 + }, + { + "epoch": 0.20460751509557323, + "grad_norm": 1.1765042543411255, + "learning_rate": 4.9428282426695646e-05, + "loss": 1.2289, + "step": 665 + }, + { + "epoch": 0.2061459174647129, + "grad_norm": 1.552025556564331, + "learning_rate": 4.9419686390047334e-05, + "loss": 1.2983, + "step": 670 + }, + { + "epoch": 0.20768431983385255, + "grad_norm": 2.3030500411987305, + "learning_rate": 4.9411026970731805e-05, + "loss": 1.1431, + "step": 675 + }, + { + "epoch": 0.2092227222029922, + "grad_norm": 1.2778570652008057, + "learning_rate": 4.9402304191225044e-05, + "loss": 1.1139, + "step": 680 + }, + { + "epoch": 0.21076112457213184, + "grad_norm": 1.5465021133422852, + "learning_rate": 4.9393518074167536e-05, + "loss": 1.1983, + "step": 685 + }, + { + "epoch": 0.2122995269412715, + "grad_norm": 1.079221248626709, + "learning_rate": 4.9384668642364126e-05, + "loss": 1.1707, + "step": 690 + }, + { + "epoch": 0.21383792931041115, + "grad_norm": 0.9178045988082886, + "learning_rate": 4.937575591878403e-05, + "loss": 1.2633, + "step": 695 + }, + { + "epoch": 0.21537633167955078, + "grad_norm": 1.1767947673797607, + "learning_rate": 4.93667799265607e-05, + "loss": 1.1424, + "step": 700 + }, + { + "epoch": 0.21691473404869044, + "grad_norm": 1.7399604320526123, + "learning_rate": 4.935774068899184e-05, + "loss": 1.2006, + "step": 705 + }, + { + "epoch": 0.2184531364178301, + "grad_norm": 1.282569169998169, + "learning_rate": 4.934863822953929e-05, + "loss": 1.31, + "step": 710 + }, + { + "epoch": 0.21999153878696973, + "grad_norm": 1.0059244632720947, + "learning_rate": 4.933947257182901e-05, + "loss": 1.2209, + "step": 715 + }, + { + "epoch": 0.22152994115610938, + "grad_norm": 1.1119279861450195, + "learning_rate": 4.9330243739650964e-05, + "loss": 1.2628, + "step": 720 + }, + { + "epoch": 0.22306834352524904, + "grad_norm": 1.2987867593765259, + "learning_rate": 4.932095175695911e-05, + "loss": 1.2771, + "step": 725 + }, + { + "epoch": 0.22460674589438867, + "grad_norm": 2.659963846206665, + "learning_rate": 4.9311596647871317e-05, + "loss": 1.2315, + "step": 730 + }, + { + "epoch": 0.22614514826352833, + "grad_norm": 1.846644639968872, + "learning_rate": 4.9302178436669286e-05, + "loss": 1.1513, + "step": 735 + }, + { + "epoch": 0.22768355063266799, + "grad_norm": 1.6089102029800415, + "learning_rate": 4.929269714779852e-05, + "loss": 1.2085, + "step": 740 + }, + { + "epoch": 0.22922195300180762, + "grad_norm": 1.953465223312378, + "learning_rate": 4.9283152805868235e-05, + "loss": 1.2173, + "step": 745 + }, + { + "epoch": 0.23076035537094727, + "grad_norm": 1.127911925315857, + "learning_rate": 4.92735454356513e-05, + "loss": 1.2868, + "step": 750 + }, + { + "epoch": 0.23229875774008693, + "grad_norm": 1.9900950193405151, + "learning_rate": 4.9263875062084194e-05, + "loss": 1.2541, + "step": 755 + }, + { + "epoch": 0.23383716010922656, + "grad_norm": 1.0306469202041626, + "learning_rate": 4.925414171026691e-05, + "loss": 1.2301, + "step": 760 + }, + { + "epoch": 0.23537556247836622, + "grad_norm": 2.1911792755126953, + "learning_rate": 4.9244345405462903e-05, + "loss": 1.2235, + "step": 765 + }, + { + "epoch": 0.23691396484750588, + "grad_norm": 1.6600091457366943, + "learning_rate": 4.923448617309905e-05, + "loss": 1.2173, + "step": 770 + }, + { + "epoch": 0.2384523672166455, + "grad_norm": 1.517933964729309, + "learning_rate": 4.922456403876552e-05, + "loss": 1.2161, + "step": 775 + }, + { + "epoch": 0.23999076958578516, + "grad_norm": 2.4637537002563477, + "learning_rate": 4.9214579028215776e-05, + "loss": 1.1367, + "step": 780 + }, + { + "epoch": 0.24152917195492482, + "grad_norm": 1.1146929264068604, + "learning_rate": 4.9204531167366485e-05, + "loss": 1.2063, + "step": 785 + }, + { + "epoch": 0.24306757432406445, + "grad_norm": 1.044700026512146, + "learning_rate": 4.919442048229743e-05, + "loss": 1.2129, + "step": 790 + }, + { + "epoch": 0.2446059766932041, + "grad_norm": 1.2291778326034546, + "learning_rate": 4.918424699925145e-05, + "loss": 1.1697, + "step": 795 + }, + { + "epoch": 0.24614437906234377, + "grad_norm": 2.0124595165252686, + "learning_rate": 4.917401074463441e-05, + "loss": 1.1384, + "step": 800 + }, + { + "epoch": 0.2476827814314834, + "grad_norm": 1.1294913291931152, + "learning_rate": 4.916371174501507e-05, + "loss": 1.1691, + "step": 805 + }, + { + "epoch": 0.24922118380062305, + "grad_norm": 2.2073304653167725, + "learning_rate": 4.9153350027125064e-05, + "loss": 1.1847, + "step": 810 + }, + { + "epoch": 0.2507595861697627, + "grad_norm": 0.9351872205734253, + "learning_rate": 4.9142925617858814e-05, + "loss": 1.1932, + "step": 815 + }, + { + "epoch": 0.25229798853890234, + "grad_norm": 1.3290927410125732, + "learning_rate": 4.913243854427346e-05, + "loss": 1.2509, + "step": 820 + }, + { + "epoch": 0.253836390908042, + "grad_norm": 1.7626986503601074, + "learning_rate": 4.9121888833588795e-05, + "loss": 1.2265, + "step": 825 + }, + { + "epoch": 0.25537479327718166, + "grad_norm": 1.9079293012619019, + "learning_rate": 4.911127651318717e-05, + "loss": 1.2083, + "step": 830 + }, + { + "epoch": 0.2569131956463213, + "grad_norm": 1.2099313735961914, + "learning_rate": 4.910060161061347e-05, + "loss": 1.0772, + "step": 835 + }, + { + "epoch": 0.25845159801546097, + "grad_norm": 2.2780282497406006, + "learning_rate": 4.9089864153575016e-05, + "loss": 1.1247, + "step": 840 + }, + { + "epoch": 0.2599900003846006, + "grad_norm": 1.4330651760101318, + "learning_rate": 4.907906416994146e-05, + "loss": 1.2317, + "step": 845 + }, + { + "epoch": 0.26152840275374023, + "grad_norm": 1.3475781679153442, + "learning_rate": 4.906820168774477e-05, + "loss": 1.2299, + "step": 850 + }, + { + "epoch": 0.2630668051228799, + "grad_norm": 1.4789565801620483, + "learning_rate": 4.905727673517914e-05, + "loss": 1.1717, + "step": 855 + }, + { + "epoch": 0.26460520749201955, + "grad_norm": 1.7257953882217407, + "learning_rate": 4.904628934060088e-05, + "loss": 1.1482, + "step": 860 + }, + { + "epoch": 0.2661436098611592, + "grad_norm": 1.5117738246917725, + "learning_rate": 4.903523953252841e-05, + "loss": 1.1459, + "step": 865 + }, + { + "epoch": 0.26768201223029886, + "grad_norm": 1.9107036590576172, + "learning_rate": 4.902412733964211e-05, + "loss": 1.1814, + "step": 870 + }, + { + "epoch": 0.26922041459943846, + "grad_norm": 1.9499353170394897, + "learning_rate": 4.901295279078431e-05, + "loss": 1.1091, + "step": 875 + }, + { + "epoch": 0.2707588169685781, + "grad_norm": 0.968377947807312, + "learning_rate": 4.900171591495918e-05, + "loss": 1.1259, + "step": 880 + }, + { + "epoch": 0.2722972193377178, + "grad_norm": 1.4124119281768799, + "learning_rate": 4.899041674133265e-05, + "loss": 1.227, + "step": 885 + }, + { + "epoch": 0.27383562170685743, + "grad_norm": 2.3951656818389893, + "learning_rate": 4.8979055299232376e-05, + "loss": 1.2486, + "step": 890 + }, + { + "epoch": 0.2753740240759971, + "grad_norm": 1.0433672666549683, + "learning_rate": 4.896763161814761e-05, + "loss": 1.2767, + "step": 895 + }, + { + "epoch": 0.27691242644513675, + "grad_norm": 3.254819393157959, + "learning_rate": 4.8956145727729156e-05, + "loss": 1.1728, + "step": 900 + }, + { + "epoch": 0.27845082881427635, + "grad_norm": 1.5969161987304688, + "learning_rate": 4.894459765778929e-05, + "loss": 1.0947, + "step": 905 + }, + { + "epoch": 0.279989231183416, + "grad_norm": 1.0399632453918457, + "learning_rate": 4.893298743830168e-05, + "loss": 1.1474, + "step": 910 + }, + { + "epoch": 0.28152763355255567, + "grad_norm": 0.8495728373527527, + "learning_rate": 4.89213150994013e-05, + "loss": 1.1966, + "step": 915 + }, + { + "epoch": 0.2830660359216953, + "grad_norm": 1.0868245363235474, + "learning_rate": 4.890958067138436e-05, + "loss": 1.194, + "step": 920 + }, + { + "epoch": 0.284604438290835, + "grad_norm": 1.8804415464401245, + "learning_rate": 4.889778418470823e-05, + "loss": 1.1109, + "step": 925 + }, + { + "epoch": 0.28614284065997464, + "grad_norm": 2.711575508117676, + "learning_rate": 4.8885925669991346e-05, + "loss": 1.1817, + "step": 930 + }, + { + "epoch": 0.28768124302911424, + "grad_norm": 1.0217710733413696, + "learning_rate": 4.887400515801315e-05, + "loss": 1.1885, + "step": 935 + }, + { + "epoch": 0.2892196453982539, + "grad_norm": 2.7488129138946533, + "learning_rate": 4.886202267971401e-05, + "loss": 1.1094, + "step": 940 + }, + { + "epoch": 0.29075804776739356, + "grad_norm": 1.5893644094467163, + "learning_rate": 4.8849978266195114e-05, + "loss": 1.1363, + "step": 945 + }, + { + "epoch": 0.2922964501365332, + "grad_norm": 1.7660548686981201, + "learning_rate": 4.883787194871841e-05, + "loss": 1.1993, + "step": 950 + }, + { + "epoch": 0.29383485250567287, + "grad_norm": 1.6953208446502686, + "learning_rate": 4.882570375870653e-05, + "loss": 1.2974, + "step": 955 + }, + { + "epoch": 0.29537325487481253, + "grad_norm": 1.191604733467102, + "learning_rate": 4.88134737277427e-05, + "loss": 1.0904, + "step": 960 + }, + { + "epoch": 0.29691165724395213, + "grad_norm": 1.317108392715454, + "learning_rate": 4.880118188757064e-05, + "loss": 1.1945, + "step": 965 + }, + { + "epoch": 0.2984500596130918, + "grad_norm": 1.4262393712997437, + "learning_rate": 4.878882827009452e-05, + "loss": 1.1332, + "step": 970 + }, + { + "epoch": 0.29998846198223145, + "grad_norm": 1.2089824676513672, + "learning_rate": 4.877641290737884e-05, + "loss": 1.1619, + "step": 975 + }, + { + "epoch": 0.3015268643513711, + "grad_norm": 1.008278727531433, + "learning_rate": 4.8763935831648374e-05, + "loss": 1.2385, + "step": 980 + }, + { + "epoch": 0.30306526672051076, + "grad_norm": 1.3324832916259766, + "learning_rate": 4.8751397075288084e-05, + "loss": 1.1853, + "step": 985 + }, + { + "epoch": 0.3046036690896504, + "grad_norm": 1.293948769569397, + "learning_rate": 4.8738796670843004e-05, + "loss": 1.17, + "step": 990 + }, + { + "epoch": 0.30614207145879, + "grad_norm": 1.5684713125228882, + "learning_rate": 4.8726134651018194e-05, + "loss": 1.1836, + "step": 995 + }, + { + "epoch": 0.3076804738279297, + "grad_norm": 1.8155791759490967, + "learning_rate": 4.8713411048678635e-05, + "loss": 1.0884, + "step": 1000 + }, + { + "epoch": 0.30921887619706934, + "grad_norm": 1.2308273315429688, + "learning_rate": 4.870062589684916e-05, + "loss": 1.2415, + "step": 1005 + }, + { + "epoch": 0.310757278566209, + "grad_norm": 1.091968297958374, + "learning_rate": 4.868777922871434e-05, + "loss": 1.1733, + "step": 1010 + }, + { + "epoch": 0.31229568093534865, + "grad_norm": 1.1831902265548706, + "learning_rate": 4.8674871077618424e-05, + "loss": 1.2495, + "step": 1015 + }, + { + "epoch": 0.3138340833044883, + "grad_norm": 1.0130064487457275, + "learning_rate": 4.8661901477065244e-05, + "loss": 1.2221, + "step": 1020 + }, + { + "epoch": 0.3153724856736279, + "grad_norm": 1.518776535987854, + "learning_rate": 4.864887046071813e-05, + "loss": 1.2491, + "step": 1025 + }, + { + "epoch": 0.31691088804276757, + "grad_norm": 2.234370708465576, + "learning_rate": 4.863577806239982e-05, + "loss": 1.2308, + "step": 1030 + }, + { + "epoch": 0.3184492904119072, + "grad_norm": 1.2248209714889526, + "learning_rate": 4.862262431609235e-05, + "loss": 1.1884, + "step": 1035 + }, + { + "epoch": 0.3199876927810469, + "grad_norm": 1.0601451396942139, + "learning_rate": 4.860940925593703e-05, + "loss": 1.0993, + "step": 1040 + }, + { + "epoch": 0.32152609515018654, + "grad_norm": 1.8158398866653442, + "learning_rate": 4.859613291623428e-05, + "loss": 1.267, + "step": 1045 + }, + { + "epoch": 0.3230644975193262, + "grad_norm": 1.0854904651641846, + "learning_rate": 4.858279533144358e-05, + "loss": 1.185, + "step": 1050 + }, + { + "epoch": 0.3246028998884658, + "grad_norm": 1.392730474472046, + "learning_rate": 4.856939653618339e-05, + "loss": 1.1568, + "step": 1055 + }, + { + "epoch": 0.32614130225760546, + "grad_norm": 1.2848827838897705, + "learning_rate": 4.855593656523103e-05, + "loss": 0.9792, + "step": 1060 + }, + { + "epoch": 0.3276797046267451, + "grad_norm": 1.0641793012619019, + "learning_rate": 4.8542415453522615e-05, + "loss": 1.0783, + "step": 1065 + }, + { + "epoch": 0.3292181069958848, + "grad_norm": 1.3434879779815674, + "learning_rate": 4.852883323615295e-05, + "loss": 1.1971, + "step": 1070 + }, + { + "epoch": 0.33075650936502443, + "grad_norm": 1.8678319454193115, + "learning_rate": 4.8515189948375434e-05, + "loss": 1.1928, + "step": 1075 + }, + { + "epoch": 0.3322949117341641, + "grad_norm": 0.9058634638786316, + "learning_rate": 4.8501485625601996e-05, + "loss": 1.2081, + "step": 1080 + }, + { + "epoch": 0.33383331410330375, + "grad_norm": 1.7286523580551147, + "learning_rate": 4.848772030340297e-05, + "loss": 1.1119, + "step": 1085 + }, + { + "epoch": 0.33537171647244335, + "grad_norm": 0.9546146392822266, + "learning_rate": 4.847389401750701e-05, + "loss": 1.2201, + "step": 1090 + }, + { + "epoch": 0.336910118841583, + "grad_norm": 1.6596899032592773, + "learning_rate": 4.846000680380105e-05, + "loss": 1.2032, + "step": 1095 + }, + { + "epoch": 0.33844852121072266, + "grad_norm": 1.3017821311950684, + "learning_rate": 4.8446058698330115e-05, + "loss": 1.0737, + "step": 1100 + }, + { + "epoch": 0.3399869235798623, + "grad_norm": 1.556097149848938, + "learning_rate": 4.843204973729729e-05, + "loss": 1.2315, + "step": 1105 + }, + { + "epoch": 0.341525325949002, + "grad_norm": 1.3806447982788086, + "learning_rate": 4.8417979957063624e-05, + "loss": 1.1993, + "step": 1110 + }, + { + "epoch": 0.34306372831814164, + "grad_norm": 1.5958844423294067, + "learning_rate": 4.8403849394148025e-05, + "loss": 1.1318, + "step": 1115 + }, + { + "epoch": 0.34460213068728124, + "grad_norm": 1.1803624629974365, + "learning_rate": 4.838965808522716e-05, + "loss": 1.1404, + "step": 1120 + }, + { + "epoch": 0.3461405330564209, + "grad_norm": 1.1995015144348145, + "learning_rate": 4.837540606713538e-05, + "loss": 1.0803, + "step": 1125 + }, + { + "epoch": 0.34767893542556055, + "grad_norm": 1.4723166227340698, + "learning_rate": 4.836109337686457e-05, + "loss": 1.2123, + "step": 1130 + }, + { + "epoch": 0.3492173377947002, + "grad_norm": 1.0283982753753662, + "learning_rate": 4.8346720051564144e-05, + "loss": 1.1247, + "step": 1135 + }, + { + "epoch": 0.35075574016383987, + "grad_norm": 1.0858715772628784, + "learning_rate": 4.833228612854087e-05, + "loss": 1.1716, + "step": 1140 + }, + { + "epoch": 0.3522941425329795, + "grad_norm": 0.8430686593055725, + "learning_rate": 4.831779164525881e-05, + "loss": 1.3617, + "step": 1145 + }, + { + "epoch": 0.35383254490211913, + "grad_norm": 3.4253408908843994, + "learning_rate": 4.83032366393392e-05, + "loss": 1.1845, + "step": 1150 + }, + { + "epoch": 0.3553709472712588, + "grad_norm": 1.2169939279556274, + "learning_rate": 4.828862114856038e-05, + "loss": 1.1945, + "step": 1155 + }, + { + "epoch": 0.35690934964039844, + "grad_norm": 0.8925756216049194, + "learning_rate": 4.827394521085767e-05, + "loss": 1.2021, + "step": 1160 + }, + { + "epoch": 0.3584477520095381, + "grad_norm": 1.1597083806991577, + "learning_rate": 4.8259208864323304e-05, + "loss": 1.1994, + "step": 1165 + }, + { + "epoch": 0.35998615437867776, + "grad_norm": 1.1406500339508057, + "learning_rate": 4.8244412147206284e-05, + "loss": 1.1749, + "step": 1170 + }, + { + "epoch": 0.3615245567478174, + "grad_norm": 0.8532546162605286, + "learning_rate": 4.822955509791233e-05, + "loss": 1.0968, + "step": 1175 + }, + { + "epoch": 0.363062959116957, + "grad_norm": 1.6647778749465942, + "learning_rate": 4.8214637755003745e-05, + "loss": 1.1396, + "step": 1180 + }, + { + "epoch": 0.3646013614860967, + "grad_norm": 4.057651042938232, + "learning_rate": 4.819966015719933e-05, + "loss": 1.2997, + "step": 1185 + }, + { + "epoch": 0.36613976385523633, + "grad_norm": 0.9485501050949097, + "learning_rate": 4.8184622343374275e-05, + "loss": 1.2487, + "step": 1190 + }, + { + "epoch": 0.367678166224376, + "grad_norm": 1.2841589450836182, + "learning_rate": 4.8169524352560076e-05, + "loss": 1.1749, + "step": 1195 + }, + { + "epoch": 0.36921656859351565, + "grad_norm": 1.6274491548538208, + "learning_rate": 4.815436622394441e-05, + "loss": 1.2342, + "step": 1200 + }, + { + "epoch": 0.3707549709626553, + "grad_norm": 1.1453361511230469, + "learning_rate": 4.813914799687107e-05, + "loss": 1.1538, + "step": 1205 + }, + { + "epoch": 0.3722933733317949, + "grad_norm": 1.7069777250289917, + "learning_rate": 4.812386971083979e-05, + "loss": 1.0581, + "step": 1210 + }, + { + "epoch": 0.37383177570093457, + "grad_norm": 1.478285789489746, + "learning_rate": 4.810853140550624e-05, + "loss": 1.1604, + "step": 1215 + }, + { + "epoch": 0.3753701780700742, + "grad_norm": 2.3933002948760986, + "learning_rate": 4.809313312068185e-05, + "loss": 1.2667, + "step": 1220 + }, + { + "epoch": 0.3769085804392139, + "grad_norm": 1.7244596481323242, + "learning_rate": 4.8077674896333725e-05, + "loss": 1.2328, + "step": 1225 + }, + { + "epoch": 0.37844698280835354, + "grad_norm": 0.8237632513046265, + "learning_rate": 4.806215677258456e-05, + "loss": 1.1748, + "step": 1230 + }, + { + "epoch": 0.3799853851774932, + "grad_norm": 1.9397246837615967, + "learning_rate": 4.8046578789712515e-05, + "loss": 1.1827, + "step": 1235 + }, + { + "epoch": 0.3815237875466328, + "grad_norm": 1.060905933380127, + "learning_rate": 4.803094098815112e-05, + "loss": 1.1902, + "step": 1240 + }, + { + "epoch": 0.38306218991577246, + "grad_norm": 1.2361557483673096, + "learning_rate": 4.801524340848917e-05, + "loss": 1.1679, + "step": 1245 + }, + { + "epoch": 0.3846005922849121, + "grad_norm": 1.1130070686340332, + "learning_rate": 4.799948609147061e-05, + "loss": 1.2108, + "step": 1250 + }, + { + "epoch": 0.38613899465405177, + "grad_norm": 1.0235941410064697, + "learning_rate": 4.798366907799444e-05, + "loss": 1.2093, + "step": 1255 + }, + { + "epoch": 0.38767739702319143, + "grad_norm": 1.1517040729522705, + "learning_rate": 4.7967792409114606e-05, + "loss": 1.1166, + "step": 1260 + }, + { + "epoch": 0.3892157993923311, + "grad_norm": 1.4021333456039429, + "learning_rate": 4.795185612603991e-05, + "loss": 1.1535, + "step": 1265 + }, + { + "epoch": 0.3907542017614707, + "grad_norm": 1.059432864189148, + "learning_rate": 4.7935860270133844e-05, + "loss": 1.266, + "step": 1270 + }, + { + "epoch": 0.39229260413061035, + "grad_norm": 1.1297060251235962, + "learning_rate": 4.791980488291456e-05, + "loss": 1.2552, + "step": 1275 + }, + { + "epoch": 0.39383100649975, + "grad_norm": 2.4244351387023926, + "learning_rate": 4.7903690006054726e-05, + "loss": 1.2457, + "step": 1280 + }, + { + "epoch": 0.39536940886888966, + "grad_norm": 1.4860352277755737, + "learning_rate": 4.7887515681381403e-05, + "loss": 1.0594, + "step": 1285 + }, + { + "epoch": 0.3969078112380293, + "grad_norm": 1.5482337474822998, + "learning_rate": 4.787128195087596e-05, + "loss": 1.1454, + "step": 1290 + }, + { + "epoch": 0.398446213607169, + "grad_norm": 1.3812161684036255, + "learning_rate": 4.785498885667395e-05, + "loss": 1.212, + "step": 1295 + }, + { + "epoch": 0.3999846159763086, + "grad_norm": 1.555119276046753, + "learning_rate": 4.783863644106502e-05, + "loss": 1.2743, + "step": 1300 + }, + { + "epoch": 0.40152301834544823, + "grad_norm": 0.8972929120063782, + "learning_rate": 4.782222474649279e-05, + "loss": 1.1012, + "step": 1305 + }, + { + "epoch": 0.4030614207145879, + "grad_norm": 1.5616463422775269, + "learning_rate": 4.780575381555472e-05, + "loss": 1.0773, + "step": 1310 + }, + { + "epoch": 0.40459982308372755, + "grad_norm": 1.2159663438796997, + "learning_rate": 4.778922369100204e-05, + "loss": 1.3538, + "step": 1315 + }, + { + "epoch": 0.4061382254528672, + "grad_norm": 1.9486379623413086, + "learning_rate": 4.7772634415739624e-05, + "loss": 1.1075, + "step": 1320 + }, + { + "epoch": 0.40767662782200687, + "grad_norm": 0.9763344526290894, + "learning_rate": 4.7755986032825864e-05, + "loss": 1.2541, + "step": 1325 + }, + { + "epoch": 0.40921503019114647, + "grad_norm": 1.8571903705596924, + "learning_rate": 4.7739278585472573e-05, + "loss": 1.1718, + "step": 1330 + }, + { + "epoch": 0.4107534325602861, + "grad_norm": 1.4408457279205322, + "learning_rate": 4.7722512117044865e-05, + "loss": 1.3156, + "step": 1335 + }, + { + "epoch": 0.4122918349294258, + "grad_norm": 0.9971082806587219, + "learning_rate": 4.7705686671061054e-05, + "loss": 1.2319, + "step": 1340 + }, + { + "epoch": 0.41383023729856544, + "grad_norm": 1.3525968790054321, + "learning_rate": 4.768880229119253e-05, + "loss": 1.1831, + "step": 1345 + }, + { + "epoch": 0.4153686396677051, + "grad_norm": 1.2504761219024658, + "learning_rate": 4.767185902126364e-05, + "loss": 1.1658, + "step": 1350 + }, + { + "epoch": 0.41690704203684475, + "grad_norm": 1.413370132446289, + "learning_rate": 4.76548569052516e-05, + "loss": 1.1391, + "step": 1355 + }, + { + "epoch": 0.4184454444059844, + "grad_norm": 1.1849104166030884, + "learning_rate": 4.763779598728636e-05, + "loss": 1.1059, + "step": 1360 + }, + { + "epoch": 0.419983846775124, + "grad_norm": 1.1771825551986694, + "learning_rate": 4.762067631165049e-05, + "loss": 1.1448, + "step": 1365 + }, + { + "epoch": 0.42152224914426367, + "grad_norm": 1.4707238674163818, + "learning_rate": 4.760349792277906e-05, + "loss": 1.1634, + "step": 1370 + }, + { + "epoch": 0.42306065151340333, + "grad_norm": 1.577568769454956, + "learning_rate": 4.758626086525956e-05, + "loss": 1.1911, + "step": 1375 + }, + { + "epoch": 0.424599053882543, + "grad_norm": NaN, + "learning_rate": 4.757242900787734e-05, + "loss": 1.1134, + "step": 1380 + }, + { + "epoch": 0.42613745625168264, + "grad_norm": 1.0331732034683228, + "learning_rate": 4.755508645963771e-05, + "loss": 1.2352, + "step": 1385 + }, + { + "epoch": 0.4276758586208223, + "grad_norm": 1.8902751207351685, + "learning_rate": 4.7537685368404684e-05, + "loss": 1.1271, + "step": 1390 + }, + { + "epoch": 0.4292142609899619, + "grad_norm": 1.1057363748550415, + "learning_rate": 4.7520225779343754e-05, + "loss": 1.1633, + "step": 1395 + }, + { + "epoch": 0.43075266335910156, + "grad_norm": 1.183310627937317, + "learning_rate": 4.7502707737772264e-05, + "loss": 1.0869, + "step": 1400 + }, + { + "epoch": 0.4322910657282412, + "grad_norm": 1.2959816455841064, + "learning_rate": 4.7485131289159276e-05, + "loss": 1.3126, + "step": 1405 + }, + { + "epoch": 0.4338294680973809, + "grad_norm": 1.5542187690734863, + "learning_rate": 4.746749647912546e-05, + "loss": 1.2092, + "step": 1410 + }, + { + "epoch": 0.43536787046652053, + "grad_norm": 0.8856958746910095, + "learning_rate": 4.7449803353442945e-05, + "loss": 1.2647, + "step": 1415 + }, + { + "epoch": 0.4369062728356602, + "grad_norm": 1.3472665548324585, + "learning_rate": 4.743205195803523e-05, + "loss": 1.2187, + "step": 1420 + }, + { + "epoch": 0.4384446752047998, + "grad_norm": 1.7011195421218872, + "learning_rate": 4.741424233897707e-05, + "loss": 1.2245, + "step": 1425 + }, + { + "epoch": 0.43998307757393945, + "grad_norm": 2.0465426445007324, + "learning_rate": 4.7396374542494314e-05, + "loss": 1.2666, + "step": 1430 + }, + { + "epoch": 0.4415214799430791, + "grad_norm": 1.2540628910064697, + "learning_rate": 4.737844861496385e-05, + "loss": 1.0815, + "step": 1435 + }, + { + "epoch": 0.44305988231221877, + "grad_norm": 1.3363851308822632, + "learning_rate": 4.736046460291342e-05, + "loss": 1.2059, + "step": 1440 + }, + { + "epoch": 0.4445982846813584, + "grad_norm": 1.0009596347808838, + "learning_rate": 4.734242255302154e-05, + "loss": 1.3611, + "step": 1445 + }, + { + "epoch": 0.4461366870504981, + "grad_norm": 0.9863151907920837, + "learning_rate": 4.732432251211735e-05, + "loss": 1.1654, + "step": 1450 + }, + { + "epoch": 0.4476750894196377, + "grad_norm": 0.8969554901123047, + "learning_rate": 4.7306164527180546e-05, + "loss": 1.1962, + "step": 1455 + }, + { + "epoch": 0.44921349178877734, + "grad_norm": 1.1396467685699463, + "learning_rate": 4.7287948645341185e-05, + "loss": 1.2541, + "step": 1460 + }, + { + "epoch": 0.450751894157917, + "grad_norm": 1.4073244333267212, + "learning_rate": 4.72696749138796e-05, + "loss": 1.1695, + "step": 1465 + }, + { + "epoch": 0.45229029652705666, + "grad_norm": 2.4806954860687256, + "learning_rate": 4.725134338022631e-05, + "loss": 1.2109, + "step": 1470 + }, + { + "epoch": 0.4538286988961963, + "grad_norm": 1.7406140565872192, + "learning_rate": 4.723295409196183e-05, + "loss": 1.1345, + "step": 1475 + }, + { + "epoch": 0.45536710126533597, + "grad_norm": 0.8337318897247314, + "learning_rate": 4.721450709681658e-05, + "loss": 1.1668, + "step": 1480 + }, + { + "epoch": 0.4569055036344756, + "grad_norm": 2.3858370780944824, + "learning_rate": 4.7196002442670794e-05, + "loss": 1.2392, + "step": 1485 + }, + { + "epoch": 0.45844390600361523, + "grad_norm": 1.21785306930542, + "learning_rate": 4.7177440177554324e-05, + "loss": 1.2615, + "step": 1490 + }, + { + "epoch": 0.4599823083727549, + "grad_norm": 1.8882992267608643, + "learning_rate": 4.715882034964657e-05, + "loss": 1.3384, + "step": 1495 + }, + { + "epoch": 0.46152071074189455, + "grad_norm": 1.1433368921279907, + "learning_rate": 4.714014300727634e-05, + "loss": 1.2788, + "step": 1500 + }, + { + "epoch": 0.4630591131110342, + "grad_norm": 1.1393554210662842, + "learning_rate": 4.7121408198921745e-05, + "loss": 1.1502, + "step": 1505 + }, + { + "epoch": 0.46459751548017386, + "grad_norm": 1.1349945068359375, + "learning_rate": 4.7102615973210004e-05, + "loss": 1.1128, + "step": 1510 + }, + { + "epoch": 0.46613591784931346, + "grad_norm": 1.2287825345993042, + "learning_rate": 4.708376637891742e-05, + "loss": 1.1704, + "step": 1515 + }, + { + "epoch": 0.4676743202184531, + "grad_norm": 1.257469892501831, + "learning_rate": 4.706485946496916e-05, + "loss": 1.1092, + "step": 1520 + }, + { + "epoch": 0.4692127225875928, + "grad_norm": 1.4764573574066162, + "learning_rate": 4.704589528043918e-05, + "loss": 1.1372, + "step": 1525 + }, + { + "epoch": 0.47075112495673244, + "grad_norm": 1.0339967012405396, + "learning_rate": 4.702687387455008e-05, + "loss": 1.0919, + "step": 1530 + }, + { + "epoch": 0.4722895273258721, + "grad_norm": 1.5123881101608276, + "learning_rate": 4.7007795296673006e-05, + "loss": 1.2167, + "step": 1535 + }, + { + "epoch": 0.47382792969501175, + "grad_norm": 1.475289225578308, + "learning_rate": 4.6988659596327465e-05, + "loss": 1.2096, + "step": 1540 + }, + { + "epoch": 0.47536633206415135, + "grad_norm": 1.229851245880127, + "learning_rate": 4.696946682318124e-05, + "loss": 1.1183, + "step": 1545 + }, + { + "epoch": 0.476904734433291, + "grad_norm": 0.7345393896102905, + "learning_rate": 4.695021702705026e-05, + "loss": 1.2528, + "step": 1550 + }, + { + "epoch": 0.47844313680243067, + "grad_norm": 1.239741563796997, + "learning_rate": 4.693091025789845e-05, + "loss": 1.1355, + "step": 1555 + }, + { + "epoch": 0.4799815391715703, + "grad_norm": 1.3871427774429321, + "learning_rate": 4.6911546565837605e-05, + "loss": 1.1942, + "step": 1560 + }, + { + "epoch": 0.48151994154071, + "grad_norm": 1.9207137823104858, + "learning_rate": 4.689212600112728e-05, + "loss": 1.1756, + "step": 1565 + }, + { + "epoch": 0.48305834390984964, + "grad_norm": 1.4990715980529785, + "learning_rate": 4.687264861417464e-05, + "loss": 1.1363, + "step": 1570 + }, + { + "epoch": 0.48459674627898924, + "grad_norm": 1.9793407917022705, + "learning_rate": 4.6853114455534345e-05, + "loss": 1.1298, + "step": 1575 + }, + { + "epoch": 0.4861351486481289, + "grad_norm": 1.1142261028289795, + "learning_rate": 4.683352357590839e-05, + "loss": 1.0793, + "step": 1580 + }, + { + "epoch": 0.48767355101726856, + "grad_norm": 1.4216675758361816, + "learning_rate": 4.6813876026146007e-05, + "loss": 1.1616, + "step": 1585 + }, + { + "epoch": 0.4892119533864082, + "grad_norm": 1.3200958967208862, + "learning_rate": 4.679417185724352e-05, + "loss": 1.2206, + "step": 1590 + }, + { + "epoch": 0.4907503557555479, + "grad_norm": 2.0705955028533936, + "learning_rate": 4.67744111203442e-05, + "loss": 1.1871, + "step": 1595 + }, + { + "epoch": 0.49228875812468753, + "grad_norm": 1.3912644386291504, + "learning_rate": 4.675459386673815e-05, + "loss": 1.2779, + "step": 1600 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 1.1627599000930786, + "learning_rate": 4.673472014786216e-05, + "loss": 1.0949, + "step": 1605 + }, + { + "epoch": 0.4953655628629668, + "grad_norm": 1.0313643217086792, + "learning_rate": 4.671479001529958e-05, + "loss": 1.2541, + "step": 1610 + }, + { + "epoch": 0.49690396523210645, + "grad_norm": 0.8410005569458008, + "learning_rate": 4.6694803520780204e-05, + "loss": 1.1915, + "step": 1615 + }, + { + "epoch": 0.4984423676012461, + "grad_norm": 1.9390909671783447, + "learning_rate": 4.66747607161801e-05, + "loss": 1.1954, + "step": 1620 + }, + { + "epoch": 0.49998076997038576, + "grad_norm": 1.324788212776184, + "learning_rate": 4.665466165352147e-05, + "loss": 1.163, + "step": 1625 + }, + { + "epoch": 0.5015191723395254, + "grad_norm": 1.9445561170578003, + "learning_rate": 4.663450638497259e-05, + "loss": 1.2891, + "step": 1630 + }, + { + "epoch": 0.5030575747086651, + "grad_norm": 1.2497957944869995, + "learning_rate": 4.661429496284757e-05, + "loss": 1.2295, + "step": 1635 + }, + { + "epoch": 0.5045959770778047, + "grad_norm": 1.6469943523406982, + "learning_rate": 4.65940274396063e-05, + "loss": 1.0742, + "step": 1640 + }, + { + "epoch": 0.5061343794469444, + "grad_norm": 1.3002201318740845, + "learning_rate": 4.657370386785427e-05, + "loss": 1.1575, + "step": 1645 + }, + { + "epoch": 0.507672781816084, + "grad_norm": 1.3192851543426514, + "learning_rate": 4.6553324300342446e-05, + "loss": 1.2364, + "step": 1650 + }, + { + "epoch": 0.5092111841852236, + "grad_norm": 2.825892925262451, + "learning_rate": 4.653288878996716e-05, + "loss": 1.3216, + "step": 1655 + }, + { + "epoch": 0.5107495865543633, + "grad_norm": 1.0338250398635864, + "learning_rate": 4.651239738976991e-05, + "loss": 1.1948, + "step": 1660 + }, + { + "epoch": 0.5122879889235029, + "grad_norm": 1.3987075090408325, + "learning_rate": 4.649185015293728e-05, + "loss": 1.1198, + "step": 1665 + }, + { + "epoch": 0.5138263912926426, + "grad_norm": 1.920446515083313, + "learning_rate": 4.647124713280078e-05, + "loss": 1.1055, + "step": 1670 + }, + { + "epoch": 0.5153647936617822, + "grad_norm": 2.621272325515747, + "learning_rate": 4.645058838283669e-05, + "loss": 1.1013, + "step": 1675 + }, + { + "epoch": 0.5169031960309219, + "grad_norm": 1.3560134172439575, + "learning_rate": 4.642987395666598e-05, + "loss": 1.2337, + "step": 1680 + }, + { + "epoch": 0.5184415984000615, + "grad_norm": 1.2603050470352173, + "learning_rate": 4.64091039080541e-05, + "loss": 1.1338, + "step": 1685 + }, + { + "epoch": 0.5199800007692011, + "grad_norm": 1.3172502517700195, + "learning_rate": 4.638827829091086e-05, + "loss": 1.2974, + "step": 1690 + }, + { + "epoch": 0.5215184031383409, + "grad_norm": 1.0840466022491455, + "learning_rate": 4.636739715929033e-05, + "loss": 1.0235, + "step": 1695 + }, + { + "epoch": 0.5230568055074805, + "grad_norm": 0.9899886250495911, + "learning_rate": 4.634646056739066e-05, + "loss": 1.3254, + "step": 1700 + }, + { + "epoch": 0.5245952078766202, + "grad_norm": 1.1562014818191528, + "learning_rate": 4.632546856955396e-05, + "loss": 1.2105, + "step": 1705 + }, + { + "epoch": 0.5261336102457598, + "grad_norm": 0.9220596551895142, + "learning_rate": 4.630442122026613e-05, + "loss": 1.2339, + "step": 1710 + }, + { + "epoch": 0.5276720126148994, + "grad_norm": 2.6827752590179443, + "learning_rate": 4.628331857415673e-05, + "loss": 1.1655, + "step": 1715 + }, + { + "epoch": 0.5292104149840391, + "grad_norm": 1.4331889152526855, + "learning_rate": 4.626216068599889e-05, + "loss": 1.1398, + "step": 1720 + }, + { + "epoch": 0.5307488173531787, + "grad_norm": 1.2920069694519043, + "learning_rate": 4.624094761070909e-05, + "loss": 1.155, + "step": 1725 + }, + { + "epoch": 0.5322872197223184, + "grad_norm": 0.9748442769050598, + "learning_rate": 4.621967940334705e-05, + "loss": 1.162, + "step": 1730 + }, + { + "epoch": 0.533825622091458, + "grad_norm": 1.2728887796401978, + "learning_rate": 4.6198356119115595e-05, + "loss": 1.2505, + "step": 1735 + }, + { + "epoch": 0.5353640244605977, + "grad_norm": 1.119135856628418, + "learning_rate": 4.617697781336052e-05, + "loss": 1.2112, + "step": 1740 + }, + { + "epoch": 0.5369024268297373, + "grad_norm": 1.0900635719299316, + "learning_rate": 4.6155544541570406e-05, + "loss": 1.1836, + "step": 1745 + }, + { + "epoch": 0.5384408291988769, + "grad_norm": 1.4258182048797607, + "learning_rate": 4.613405635937651e-05, + "loss": 1.1892, + "step": 1750 + }, + { + "epoch": 0.5399792315680166, + "grad_norm": 1.1550543308258057, + "learning_rate": 4.611251332255264e-05, + "loss": 1.1996, + "step": 1755 + }, + { + "epoch": 0.5415176339371562, + "grad_norm": 1.2096160650253296, + "learning_rate": 4.609091548701493e-05, + "loss": 1.1016, + "step": 1760 + }, + { + "epoch": 0.543056036306296, + "grad_norm": 0.9595054984092712, + "learning_rate": 4.6069262908821784e-05, + "loss": 1.233, + "step": 1765 + }, + { + "epoch": 0.5445944386754356, + "grad_norm": 1.478603482246399, + "learning_rate": 4.604755564417369e-05, + "loss": 1.2789, + "step": 1770 + }, + { + "epoch": 0.5461328410445752, + "grad_norm": 1.2694469690322876, + "learning_rate": 4.602579374941307e-05, + "loss": 1.3355, + "step": 1775 + }, + { + "epoch": 0.5476712434137149, + "grad_norm": 1.3089395761489868, + "learning_rate": 4.600397728102414e-05, + "loss": 1.0967, + "step": 1780 + }, + { + "epoch": 0.5492096457828545, + "grad_norm": 2.684046506881714, + "learning_rate": 4.5982106295632765e-05, + "loss": 1.0866, + "step": 1785 + }, + { + "epoch": 0.5507480481519942, + "grad_norm": 1.414984107017517, + "learning_rate": 4.596018085000633e-05, + "loss": 1.2533, + "step": 1790 + }, + { + "epoch": 0.5522864505211338, + "grad_norm": 1.0765364170074463, + "learning_rate": 4.593820100105355e-05, + "loss": 1.2313, + "step": 1795 + }, + { + "epoch": 0.5538248528902735, + "grad_norm": 0.9591373801231384, + "learning_rate": 4.5916166805824353e-05, + "loss": 1.1038, + "step": 1800 + }, + { + "epoch": 0.5553632552594131, + "grad_norm": 1.9848989248275757, + "learning_rate": 4.589407832150974e-05, + "loss": 1.1897, + "step": 1805 + }, + { + "epoch": 0.5569016576285527, + "grad_norm": 1.1997350454330444, + "learning_rate": 4.5871935605441606e-05, + "loss": 1.1637, + "step": 1810 + }, + { + "epoch": 0.5584400599976924, + "grad_norm": 0.9119892716407776, + "learning_rate": 4.5849738715092624e-05, + "loss": 1.3695, + "step": 1815 + }, + { + "epoch": 0.559978462366832, + "grad_norm": 1.0208933353424072, + "learning_rate": 4.582748770807605e-05, + "loss": 1.1085, + "step": 1820 + }, + { + "epoch": 0.5615168647359717, + "grad_norm": 1.3944216966629028, + "learning_rate": 4.580518264214564e-05, + "loss": 1.2322, + "step": 1825 + }, + { + "epoch": 0.5630552671051113, + "grad_norm": 0.9918113946914673, + "learning_rate": 4.5782823575195444e-05, + "loss": 1.0482, + "step": 1830 + }, + { + "epoch": 0.5645936694742509, + "grad_norm": 0.9428990483283997, + "learning_rate": 4.576041056525966e-05, + "loss": 1.132, + "step": 1835 + }, + { + "epoch": 0.5661320718433906, + "grad_norm": 1.1514095067977905, + "learning_rate": 4.5737943670512534e-05, + "loss": 1.2385, + "step": 1840 + }, + { + "epoch": 0.5676704742125303, + "grad_norm": 2.3423311710357666, + "learning_rate": 4.5715422949268136e-05, + "loss": 1.2718, + "step": 1845 + }, + { + "epoch": 0.56920887658167, + "grad_norm": 1.2614145278930664, + "learning_rate": 4.5692848459980275e-05, + "loss": 1.1639, + "step": 1850 + }, + { + "epoch": 0.5707472789508096, + "grad_norm": 1.355564832687378, + "learning_rate": 4.56702202612423e-05, + "loss": 1.2753, + "step": 1855 + }, + { + "epoch": 0.5722856813199493, + "grad_norm": 1.5028793811798096, + "learning_rate": 4.564753841178697e-05, + "loss": 1.2298, + "step": 1860 + }, + { + "epoch": 0.5738240836890889, + "grad_norm": 2.331164598464966, + "learning_rate": 4.5624802970486295e-05, + "loss": 1.1079, + "step": 1865 + }, + { + "epoch": 0.5753624860582285, + "grad_norm": 1.438877820968628, + "learning_rate": 4.56020139963514e-05, + "loss": 1.1423, + "step": 1870 + }, + { + "epoch": 0.5769008884273682, + "grad_norm": 1.6183146238327026, + "learning_rate": 4.557917154853234e-05, + "loss": 1.3155, + "step": 1875 + }, + { + "epoch": 0.5784392907965078, + "grad_norm": 1.206664800643921, + "learning_rate": 4.555627568631798e-05, + "loss": 1.1703, + "step": 1880 + }, + { + "epoch": 0.5799776931656475, + "grad_norm": 0.9220191240310669, + "learning_rate": 4.553332646913581e-05, + "loss": 1.2033, + "step": 1885 + }, + { + "epoch": 0.5815160955347871, + "grad_norm": 1.4525690078735352, + "learning_rate": 4.551032395655181e-05, + "loss": 1.128, + "step": 1890 + }, + { + "epoch": 0.5830544979039268, + "grad_norm": 0.9040252566337585, + "learning_rate": 4.5487268208270284e-05, + "loss": 1.1954, + "step": 1895 + }, + { + "epoch": 0.5845929002730664, + "grad_norm": 1.0403451919555664, + "learning_rate": 4.546415928413373e-05, + "loss": 1.1627, + "step": 1900 + }, + { + "epoch": 0.586131302642206, + "grad_norm": 0.8466317057609558, + "learning_rate": 4.544099724412267e-05, + "loss": 1.2379, + "step": 1905 + }, + { + "epoch": 0.5876697050113457, + "grad_norm": 1.0065761804580688, + "learning_rate": 4.5417782148355464e-05, + "loss": 1.2146, + "step": 1910 + }, + { + "epoch": 0.5892081073804853, + "grad_norm": 1.1972275972366333, + "learning_rate": 4.5394514057088197e-05, + "loss": 1.1367, + "step": 1915 + }, + { + "epoch": 0.5907465097496251, + "grad_norm": 1.0356810092926025, + "learning_rate": 4.5371193030714524e-05, + "loss": 1.123, + "step": 1920 + }, + { + "epoch": 0.5922849121187647, + "grad_norm": 2.044755458831787, + "learning_rate": 4.534781912976546e-05, + "loss": 1.1501, + "step": 1925 + }, + { + "epoch": 0.5938233144879043, + "grad_norm": 4.12185525894165, + "learning_rate": 4.532439241490928e-05, + "loss": 1.1789, + "step": 1930 + }, + { + "epoch": 0.595361716857044, + "grad_norm": 1.5785492658615112, + "learning_rate": 4.530091294695134e-05, + "loss": 1.2626, + "step": 1935 + }, + { + "epoch": 0.5969001192261836, + "grad_norm": 1.0973812341690063, + "learning_rate": 4.527738078683391e-05, + "loss": 1.1505, + "step": 1940 + }, + { + "epoch": 0.5984385215953233, + "grad_norm": 0.8266382813453674, + "learning_rate": 4.525379599563606e-05, + "loss": 1.2476, + "step": 1945 + }, + { + "epoch": 0.5999769239644629, + "grad_norm": 1.5638591051101685, + "learning_rate": 4.5230158634573406e-05, + "loss": 1.0609, + "step": 1950 + }, + { + "epoch": 0.6015153263336026, + "grad_norm": 1.0881614685058594, + "learning_rate": 4.5206468764998065e-05, + "loss": 1.1339, + "step": 1955 + }, + { + "epoch": 0.6030537287027422, + "grad_norm": 1.1539703607559204, + "learning_rate": 4.518272644839843e-05, + "loss": 1.0505, + "step": 1960 + }, + { + "epoch": 0.6045921310718818, + "grad_norm": 1.331925630569458, + "learning_rate": 4.515893174639899e-05, + "loss": 1.1124, + "step": 1965 + }, + { + "epoch": 0.6061305334410215, + "grad_norm": 1.0339001417160034, + "learning_rate": 4.5135084720760254e-05, + "loss": 1.1829, + "step": 1970 + }, + { + "epoch": 0.6076689358101611, + "grad_norm": 0.8567355275154114, + "learning_rate": 4.5111185433378514e-05, + "loss": 1.1963, + "step": 1975 + }, + { + "epoch": 0.6092073381793008, + "grad_norm": 1.3100231885910034, + "learning_rate": 4.50872339462857e-05, + "loss": 1.2722, + "step": 1980 + }, + { + "epoch": 0.6107457405484404, + "grad_norm": 1.4597526788711548, + "learning_rate": 4.506323032164925e-05, + "loss": 1.3127, + "step": 1985 + }, + { + "epoch": 0.61228414291758, + "grad_norm": 1.6040161848068237, + "learning_rate": 4.503917462177192e-05, + "loss": 1.2423, + "step": 1990 + }, + { + "epoch": 0.6138225452867198, + "grad_norm": 1.1720243692398071, + "learning_rate": 4.5015066909091625e-05, + "loss": 1.1971, + "step": 1995 + }, + { + "epoch": 0.6153609476558594, + "grad_norm": 0.9491767883300781, + "learning_rate": 4.499090724618129e-05, + "loss": 1.1892, + "step": 2000 + }, + { + "epoch": 0.6168993500249991, + "grad_norm": 1.83636474609375, + "learning_rate": 4.4966695695748686e-05, + "loss": 1.0706, + "step": 2005 + }, + { + "epoch": 0.6184377523941387, + "grad_norm": 1.8804785013198853, + "learning_rate": 4.494243232063623e-05, + "loss": 1.2081, + "step": 2010 + }, + { + "epoch": 0.6199761547632784, + "grad_norm": 1.1638429164886475, + "learning_rate": 4.4918117183820894e-05, + "loss": 1.1153, + "step": 2015 + }, + { + "epoch": 0.621514557132418, + "grad_norm": 1.120448112487793, + "learning_rate": 4.489375034841397e-05, + "loss": 1.26, + "step": 2020 + }, + { + "epoch": 0.6230529595015576, + "grad_norm": 1.4325777292251587, + "learning_rate": 4.486933187766095e-05, + "loss": 1.1023, + "step": 2025 + }, + { + "epoch": 0.6245913618706973, + "grad_norm": 1.2294832468032837, + "learning_rate": 4.484486183494136e-05, + "loss": 1.1005, + "step": 2030 + }, + { + "epoch": 0.6261297642398369, + "grad_norm": 1.6747307777404785, + "learning_rate": 4.4820340283768544e-05, + "loss": 1.2555, + "step": 2035 + }, + { + "epoch": 0.6276681666089766, + "grad_norm": 1.192826509475708, + "learning_rate": 4.479576728778958e-05, + "loss": 1.0987, + "step": 2040 + }, + { + "epoch": 0.6292065689781162, + "grad_norm": 1.767940878868103, + "learning_rate": 4.477114291078506e-05, + "loss": 1.0992, + "step": 2045 + }, + { + "epoch": 0.6307449713472558, + "grad_norm": 1.5168949365615845, + "learning_rate": 4.474646721666893e-05, + "loss": 1.1127, + "step": 2050 + }, + { + "epoch": 0.6322833737163955, + "grad_norm": 2.4314072132110596, + "learning_rate": 4.4721740269488355e-05, + "loss": 1.141, + "step": 2055 + }, + { + "epoch": 0.6338217760855351, + "grad_norm": 1.7688580751419067, + "learning_rate": 4.46969621334235e-05, + "loss": 1.1934, + "step": 2060 + }, + { + "epoch": 0.6353601784546749, + "grad_norm": 1.0726224184036255, + "learning_rate": 4.467213287278741e-05, + "loss": 1.129, + "step": 2065 + }, + { + "epoch": 0.6368985808238145, + "grad_norm": 1.0851497650146484, + "learning_rate": 4.464725255202582e-05, + "loss": 1.1553, + "step": 2070 + }, + { + "epoch": 0.6384369831929542, + "grad_norm": 1.4708757400512695, + "learning_rate": 4.462232123571702e-05, + "loss": 1.1845, + "step": 2075 + }, + { + "epoch": 0.6399753855620938, + "grad_norm": 1.3815021514892578, + "learning_rate": 4.459733898857162e-05, + "loss": 1.164, + "step": 2080 + }, + { + "epoch": 0.6415137879312334, + "grad_norm": 0.9713008999824524, + "learning_rate": 4.4572305875432465e-05, + "loss": 1.2635, + "step": 2085 + }, + { + "epoch": 0.6430521903003731, + "grad_norm": 1.7279797792434692, + "learning_rate": 4.4547221961274385e-05, + "loss": 1.2015, + "step": 2090 + }, + { + "epoch": 0.6445905926695127, + "grad_norm": 3.183756113052368, + "learning_rate": 4.4522087311204096e-05, + "loss": 1.0928, + "step": 2095 + }, + { + "epoch": 0.6461289950386524, + "grad_norm": 2.138756513595581, + "learning_rate": 4.449690199046e-05, + "loss": 1.2496, + "step": 2100 + }, + { + "epoch": 0.647667397407792, + "grad_norm": 1.7964316606521606, + "learning_rate": 4.447166606441201e-05, + "loss": 1.123, + "step": 2105 + }, + { + "epoch": 0.6492057997769316, + "grad_norm": 2.1838841438293457, + "learning_rate": 4.444637959856137e-05, + "loss": 1.0909, + "step": 2110 + }, + { + "epoch": 0.6507442021460713, + "grad_norm": 1.1749359369277954, + "learning_rate": 4.442104265854055e-05, + "loss": 1.275, + "step": 2115 + }, + { + "epoch": 0.6522826045152109, + "grad_norm": 1.2564892768859863, + "learning_rate": 4.439565531011299e-05, + "loss": 1.1814, + "step": 2120 + }, + { + "epoch": 0.6538210068843506, + "grad_norm": 1.1078180074691772, + "learning_rate": 4.4370217619172964e-05, + "loss": 1.1717, + "step": 2125 + }, + { + "epoch": 0.6553594092534902, + "grad_norm": 1.0620262622833252, + "learning_rate": 4.434472965174545e-05, + "loss": 1.2006, + "step": 2130 + }, + { + "epoch": 0.65689781162263, + "grad_norm": 1.668600082397461, + "learning_rate": 4.4319191473985884e-05, + "loss": 1.16, + "step": 2135 + }, + { + "epoch": 0.6584362139917695, + "grad_norm": 0.7764678597450256, + "learning_rate": 4.429360315218005e-05, + "loss": 1.3311, + "step": 2140 + }, + { + "epoch": 0.6599746163609091, + "grad_norm": 0.9446391463279724, + "learning_rate": 4.4267964752743854e-05, + "loss": 1.2131, + "step": 2145 + }, + { + "epoch": 0.6615130187300489, + "grad_norm": 0.8063297271728516, + "learning_rate": 4.4242276342223235e-05, + "loss": 1.15, + "step": 2150 + }, + { + "epoch": 0.6630514210991885, + "grad_norm": 1.4049417972564697, + "learning_rate": 4.421653798729387e-05, + "loss": 1.2054, + "step": 2155 + }, + { + "epoch": 0.6645898234683282, + "grad_norm": 1.716774344444275, + "learning_rate": 4.4190749754761126e-05, + "loss": 1.055, + "step": 2160 + }, + { + "epoch": 0.6661282258374678, + "grad_norm": 2.2953073978424072, + "learning_rate": 4.4164911711559803e-05, + "loss": 1.2199, + "step": 2165 + }, + { + "epoch": 0.6676666282066075, + "grad_norm": 0.9870630502700806, + "learning_rate": 4.4139023924753995e-05, + "loss": 1.1933, + "step": 2170 + }, + { + "epoch": 0.6692050305757471, + "grad_norm": 1.69342839717865, + "learning_rate": 4.41130864615369e-05, + "loss": 1.1602, + "step": 2175 + }, + { + "epoch": 0.6707434329448867, + "grad_norm": 1.0127511024475098, + "learning_rate": 4.408709938923067e-05, + "loss": 1.2509, + "step": 2180 + }, + { + "epoch": 0.6722818353140264, + "grad_norm": 1.2484458684921265, + "learning_rate": 4.40610627752862e-05, + "loss": 1.1144, + "step": 2185 + }, + { + "epoch": 0.673820237683166, + "grad_norm": 1.0753147602081299, + "learning_rate": 4.403497668728299e-05, + "loss": 1.2816, + "step": 2190 + }, + { + "epoch": 0.6753586400523057, + "grad_norm": 1.2890567779541016, + "learning_rate": 4.400884119292894e-05, + "loss": 1.1233, + "step": 2195 + }, + { + "epoch": 0.6768970424214453, + "grad_norm": 1.108391284942627, + "learning_rate": 4.39826563600602e-05, + "loss": 1.198, + "step": 2200 + }, + { + "epoch": 0.6784354447905849, + "grad_norm": 0.9689030051231384, + "learning_rate": 4.395642225664097e-05, + "loss": 1.2354, + "step": 2205 + }, + { + "epoch": 0.6799738471597246, + "grad_norm": 1.4813648462295532, + "learning_rate": 4.393013895076335e-05, + "loss": 1.1986, + "step": 2210 + }, + { + "epoch": 0.6815122495288642, + "grad_norm": 1.4557727575302124, + "learning_rate": 4.3903806510647115e-05, + "loss": 1.2441, + "step": 2215 + }, + { + "epoch": 0.683050651898004, + "grad_norm": 5.211197853088379, + "learning_rate": 4.3877425004639616e-05, + "loss": 1.2142, + "step": 2220 + }, + { + "epoch": 0.6845890542671436, + "grad_norm": 1.0981544256210327, + "learning_rate": 4.385099450121551e-05, + "loss": 1.1192, + "step": 2225 + }, + { + "epoch": 0.6861274566362833, + "grad_norm": 1.4392608404159546, + "learning_rate": 4.3824515068976666e-05, + "loss": 1.0806, + "step": 2230 + }, + { + "epoch": 0.6876658590054229, + "grad_norm": 0.9992257952690125, + "learning_rate": 4.3797986776651934e-05, + "loss": 1.2906, + "step": 2235 + }, + { + "epoch": 0.6892042613745625, + "grad_norm": 1.0846220254898071, + "learning_rate": 4.3771409693096985e-05, + "loss": 1.145, + "step": 2240 + }, + { + "epoch": 0.6907426637437022, + "grad_norm": 1.7244092226028442, + "learning_rate": 4.374478388729414e-05, + "loss": 1.2479, + "step": 2245 + }, + { + "epoch": 0.6922810661128418, + "grad_norm": 1.187263011932373, + "learning_rate": 4.371810942835215e-05, + "loss": 1.2843, + "step": 2250 + }, + { + "epoch": 0.6938194684819815, + "grad_norm": 1.1405870914459229, + "learning_rate": 4.369138638550611e-05, + "loss": 1.2483, + "step": 2255 + }, + { + "epoch": 0.6953578708511211, + "grad_norm": 1.399223804473877, + "learning_rate": 4.3664614828117137e-05, + "loss": 1.1166, + "step": 2260 + }, + { + "epoch": 0.6968962732202607, + "grad_norm": 0.8091042637825012, + "learning_rate": 4.363779482567234e-05, + "loss": 1.2809, + "step": 2265 + }, + { + "epoch": 0.6984346755894004, + "grad_norm": 1.0107675790786743, + "learning_rate": 4.3610926447784534e-05, + "loss": 1.1957, + "step": 2270 + }, + { + "epoch": 0.69997307795854, + "grad_norm": 1.485467791557312, + "learning_rate": 4.3584009764192094e-05, + "loss": 1.1612, + "step": 2275 + }, + { + "epoch": 0.7015114803276797, + "grad_norm": 1.0193709135055542, + "learning_rate": 4.3557044844758796e-05, + "loss": 1.1595, + "step": 2280 + }, + { + "epoch": 0.7030498826968193, + "grad_norm": 1.1308696269989014, + "learning_rate": 4.353003175947359e-05, + "loss": 1.1202, + "step": 2285 + }, + { + "epoch": 0.704588285065959, + "grad_norm": 1.233419418334961, + "learning_rate": 4.3502970578450466e-05, + "loss": 1.1749, + "step": 2290 + }, + { + "epoch": 0.7061266874350987, + "grad_norm": 0.9433870315551758, + "learning_rate": 4.3475861371928225e-05, + "loss": 1.1464, + "step": 2295 + }, + { + "epoch": 0.7076650898042383, + "grad_norm": 1.7299931049346924, + "learning_rate": 4.344870421027036e-05, + "loss": 1.1479, + "step": 2300 + }, + { + "epoch": 0.709203492173378, + "grad_norm": 1.1724836826324463, + "learning_rate": 4.3421499163964784e-05, + "loss": 1.1548, + "step": 2305 + }, + { + "epoch": 0.7107418945425176, + "grad_norm": 1.5910905599594116, + "learning_rate": 4.339424630362373e-05, + "loss": 1.1267, + "step": 2310 + }, + { + "epoch": 0.7122802969116573, + "grad_norm": 1.4339747428894043, + "learning_rate": 4.336694569998354e-05, + "loss": 1.2297, + "step": 2315 + }, + { + "epoch": 0.7138186992807969, + "grad_norm": 0.7568414211273193, + "learning_rate": 4.333959742390444e-05, + "loss": 1.228, + "step": 2320 + }, + { + "epoch": 0.7153571016499365, + "grad_norm": 1.1691060066223145, + "learning_rate": 4.331220154637044e-05, + "loss": 1.0902, + "step": 2325 + }, + { + "epoch": 0.7168955040190762, + "grad_norm": 1.296523094177246, + "learning_rate": 4.328475813848906e-05, + "loss": 1.2011, + "step": 2330 + }, + { + "epoch": 0.7184339063882158, + "grad_norm": 1.0982245206832886, + "learning_rate": 4.325726727149122e-05, + "loss": 1.1082, + "step": 2335 + }, + { + "epoch": 0.7199723087573555, + "grad_norm": 1.2875556945800781, + "learning_rate": 4.3229729016731005e-05, + "loss": 1.2202, + "step": 2340 + }, + { + "epoch": 0.7215107111264951, + "grad_norm": 2.59907865524292, + "learning_rate": 4.320214344568549e-05, + "loss": 1.1947, + "step": 2345 + }, + { + "epoch": 0.7230491134956348, + "grad_norm": 1.3555256128311157, + "learning_rate": 4.317451062995458e-05, + "loss": 1.1544, + "step": 2350 + }, + { + "epoch": 0.7245875158647744, + "grad_norm": 2.351750135421753, + "learning_rate": 4.3146830641260815e-05, + "loss": 1.1858, + "step": 2355 + }, + { + "epoch": 0.726125918233914, + "grad_norm": 1.5076963901519775, + "learning_rate": 4.311910355144914e-05, + "loss": 1.2452, + "step": 2360 + }, + { + "epoch": 0.7276643206030537, + "grad_norm": 1.020558476448059, + "learning_rate": 4.309132943248678e-05, + "loss": 1.1731, + "step": 2365 + }, + { + "epoch": 0.7292027229721934, + "grad_norm": 1.3791615962982178, + "learning_rate": 4.306350835646303e-05, + "loss": 1.1717, + "step": 2370 + }, + { + "epoch": 0.7307411253413331, + "grad_norm": 1.8552170991897583, + "learning_rate": 4.303564039558904e-05, + "loss": 1.1315, + "step": 2375 + }, + { + "epoch": 0.7322795277104727, + "grad_norm": 1.5803345441818237, + "learning_rate": 4.3007725622197674e-05, + "loss": 1.1341, + "step": 2380 + }, + { + "epoch": 0.7338179300796123, + "grad_norm": 1.0923699140548706, + "learning_rate": 4.2979764108743296e-05, + "loss": 1.0577, + "step": 2385 + }, + { + "epoch": 0.735356332448752, + "grad_norm": 0.9132150411605835, + "learning_rate": 4.295175592780158e-05, + "loss": 1.1672, + "step": 2390 + }, + { + "epoch": 0.7368947348178916, + "grad_norm": 1.6605010032653809, + "learning_rate": 4.2923701152069336e-05, + "loss": 1.1087, + "step": 2395 + }, + { + "epoch": 0.7384331371870313, + "grad_norm": 1.2207224369049072, + "learning_rate": 4.28955998543643e-05, + "loss": 1.1708, + "step": 2400 + }, + { + "epoch": 0.7399715395561709, + "grad_norm": 2.0855581760406494, + "learning_rate": 4.286745210762499e-05, + "loss": 1.1594, + "step": 2405 + }, + { + "epoch": 0.7415099419253106, + "grad_norm": 1.4070476293563843, + "learning_rate": 4.283925798491044e-05, + "loss": 1.2085, + "step": 2410 + }, + { + "epoch": 0.7430483442944502, + "grad_norm": 1.0494474172592163, + "learning_rate": 4.281101755940009e-05, + "loss": 1.1877, + "step": 2415 + }, + { + "epoch": 0.7445867466635898, + "grad_norm": 1.0690234899520874, + "learning_rate": 4.2782730904393546e-05, + "loss": 1.0301, + "step": 2420 + }, + { + "epoch": 0.7461251490327295, + "grad_norm": 1.0572775602340698, + "learning_rate": 4.275439809331041e-05, + "loss": 1.1589, + "step": 2425 + }, + { + "epoch": 0.7476635514018691, + "grad_norm": 1.0381057262420654, + "learning_rate": 4.2726019199690093e-05, + "loss": 1.1945, + "step": 2430 + }, + { + "epoch": 0.7492019537710088, + "grad_norm": 2.4231226444244385, + "learning_rate": 4.269759429719159e-05, + "loss": 1.1851, + "step": 2435 + }, + { + "epoch": 0.7507403561401484, + "grad_norm": 1.1113394498825073, + "learning_rate": 4.266912345959335e-05, + "loss": 1.1269, + "step": 2440 + }, + { + "epoch": 0.7522787585092882, + "grad_norm": 1.7000584602355957, + "learning_rate": 4.264060676079302e-05, + "loss": 1.1546, + "step": 2445 + }, + { + "epoch": 0.7538171608784278, + "grad_norm": 4.711215972900391, + "learning_rate": 4.2612044274807295e-05, + "loss": 1.1901, + "step": 2450 + }, + { + "epoch": 0.7553555632475674, + "grad_norm": 1.1425622701644897, + "learning_rate": 4.2583436075771706e-05, + "loss": 1.2222, + "step": 2455 + }, + { + "epoch": 0.7568939656167071, + "grad_norm": 1.1163311004638672, + "learning_rate": 4.255478223794045e-05, + "loss": 1.066, + "step": 2460 + }, + { + "epoch": 0.7584323679858467, + "grad_norm": 0.741192102432251, + "learning_rate": 4.252608283568616e-05, + "loss": 1.3143, + "step": 2465 + }, + { + "epoch": 0.7599707703549864, + "grad_norm": 0.9434729218482971, + "learning_rate": 4.249733794349976e-05, + "loss": 1.1981, + "step": 2470 + }, + { + "epoch": 0.761509172724126, + "grad_norm": 1.200185775756836, + "learning_rate": 4.246854763599022e-05, + "loss": 1.1042, + "step": 2475 + }, + { + "epoch": 0.7630475750932656, + "grad_norm": 1.1456506252288818, + "learning_rate": 4.2439711987884406e-05, + "loss": 1.1917, + "step": 2480 + }, + { + "epoch": 0.7645859774624053, + "grad_norm": 1.4507484436035156, + "learning_rate": 4.241083107402687e-05, + "loss": 1.249, + "step": 2485 + }, + { + "epoch": 0.7661243798315449, + "grad_norm": 1.5303988456726074, + "learning_rate": 4.238190496937962e-05, + "loss": 1.2334, + "step": 2490 + }, + { + "epoch": 0.7676627822006846, + "grad_norm": 1.067226529121399, + "learning_rate": 4.235293374902201e-05, + "loss": 1.211, + "step": 2495 + }, + { + "epoch": 0.7692011845698242, + "grad_norm": 1.0933020114898682, + "learning_rate": 4.232391748815046e-05, + "loss": 1.1696, + "step": 2500 + }, + { + "epoch": 0.7707395869389639, + "grad_norm": 0.9041593670845032, + "learning_rate": 4.2294856262078296e-05, + "loss": 1.0995, + "step": 2505 + }, + { + "epoch": 0.7722779893081035, + "grad_norm": 0.886325478553772, + "learning_rate": 4.226575014623557e-05, + "loss": 1.2787, + "step": 2510 + }, + { + "epoch": 0.7738163916772431, + "grad_norm": 0.8392224311828613, + "learning_rate": 4.223659921616885e-05, + "loss": 1.1928, + "step": 2515 + }, + { + "epoch": 0.7753547940463829, + "grad_norm": 2.0462377071380615, + "learning_rate": 4.2207403547541e-05, + "loss": 1.2198, + "step": 2520 + }, + { + "epoch": 0.7768931964155225, + "grad_norm": 0.8376229405403137, + "learning_rate": 4.2178163216131015e-05, + "loss": 1.1459, + "step": 2525 + }, + { + "epoch": 0.7784315987846622, + "grad_norm": 1.8360308408737183, + "learning_rate": 4.214887829783383e-05, + "loss": 1.2045, + "step": 2530 + }, + { + "epoch": 0.7799700011538018, + "grad_norm": 1.962247610092163, + "learning_rate": 4.2119548868660084e-05, + "loss": 1.1892, + "step": 2535 + }, + { + "epoch": 0.7815084035229414, + "grad_norm": 1.0114338397979736, + "learning_rate": 4.209017500473596e-05, + "loss": 1.2576, + "step": 2540 + }, + { + "epoch": 0.7830468058920811, + "grad_norm": 2.040184259414673, + "learning_rate": 4.206075678230297e-05, + "loss": 1.0698, + "step": 2545 + }, + { + "epoch": 0.7845852082612207, + "grad_norm": 1.3583984375, + "learning_rate": 4.203129427771776e-05, + "loss": 1.1858, + "step": 2550 + }, + { + "epoch": 0.7861236106303604, + "grad_norm": 2.056542158126831, + "learning_rate": 4.200178756745192e-05, + "loss": 1.2202, + "step": 2555 + }, + { + "epoch": 0.7876620129995, + "grad_norm": 1.1544386148452759, + "learning_rate": 4.197223672809177e-05, + "loss": 1.1182, + "step": 2560 + }, + { + "epoch": 0.7892004153686397, + "grad_norm": 1.046268343925476, + "learning_rate": 4.194264183633818e-05, + "loss": 1.1568, + "step": 2565 + }, + { + "epoch": 0.7907388177377793, + "grad_norm": 1.4613105058670044, + "learning_rate": 4.1913002969006344e-05, + "loss": 1.2269, + "step": 2570 + }, + { + "epoch": 0.7922772201069189, + "grad_norm": 0.9770887494087219, + "learning_rate": 4.188332020302561e-05, + "loss": 1.0355, + "step": 2575 + }, + { + "epoch": 0.7938156224760586, + "grad_norm": 1.2040941715240479, + "learning_rate": 4.185359361543927e-05, + "loss": 1.2607, + "step": 2580 + }, + { + "epoch": 0.7953540248451982, + "grad_norm": 1.4072517156600952, + "learning_rate": 4.182382328340434e-05, + "loss": 1.2368, + "step": 2585 + }, + { + "epoch": 0.796892427214338, + "grad_norm": 1.2777091264724731, + "learning_rate": 4.17940092841914e-05, + "loss": 1.3141, + "step": 2590 + }, + { + "epoch": 0.7984308295834776, + "grad_norm": 0.8566577434539795, + "learning_rate": 4.176415169518434e-05, + "loss": 1.1347, + "step": 2595 + }, + { + "epoch": 0.7999692319526172, + "grad_norm": 1.2150745391845703, + "learning_rate": 4.173425059388023e-05, + "loss": 0.99, + "step": 2600 + }, + { + "epoch": 0.8015076343217569, + "grad_norm": 1.1194686889648438, + "learning_rate": 4.1704306057889053e-05, + "loss": 1.1714, + "step": 2605 + }, + { + "epoch": 0.8030460366908965, + "grad_norm": 1.402685523033142, + "learning_rate": 4.167431816493352e-05, + "loss": 1.2142, + "step": 2610 + }, + { + "epoch": 0.8045844390600362, + "grad_norm": 1.0395668745040894, + "learning_rate": 4.1644286992848916e-05, + "loss": 1.1401, + "step": 2615 + }, + { + "epoch": 0.8061228414291758, + "grad_norm": 0.9008033871650696, + "learning_rate": 4.161421261958281e-05, + "loss": 1.2895, + "step": 2620 + }, + { + "epoch": 0.8076612437983155, + "grad_norm": 1.628246545791626, + "learning_rate": 4.158409512319493e-05, + "loss": 1.0513, + "step": 2625 + }, + { + "epoch": 0.8091996461674551, + "grad_norm": 1.2516933679580688, + "learning_rate": 4.1553934581856945e-05, + "loss": 1.1017, + "step": 2630 + }, + { + "epoch": 0.8107380485365947, + "grad_norm": 1.1752599477767944, + "learning_rate": 4.1523731073852215e-05, + "loss": 1.2321, + "step": 2635 + }, + { + "epoch": 0.8122764509057344, + "grad_norm": 1.0307414531707764, + "learning_rate": 4.149348467757566e-05, + "loss": 1.3027, + "step": 2640 + }, + { + "epoch": 0.813814853274874, + "grad_norm": 1.7551286220550537, + "learning_rate": 4.1463195471533476e-05, + "loss": 1.0455, + "step": 2645 + }, + { + "epoch": 0.8153532556440137, + "grad_norm": 1.2423598766326904, + "learning_rate": 4.1432863534343016e-05, + "loss": 1.0785, + "step": 2650 + }, + { + "epoch": 0.8168916580131533, + "grad_norm": 2.572627305984497, + "learning_rate": 4.140248894473253e-05, + "loss": 1.2205, + "step": 2655 + }, + { + "epoch": 0.8184300603822929, + "grad_norm": 1.2887595891952515, + "learning_rate": 4.137207178154095e-05, + "loss": 1.0928, + "step": 2660 + }, + { + "epoch": 0.8199684627514326, + "grad_norm": 1.1292917728424072, + "learning_rate": 4.134161212371776e-05, + "loss": 1.0875, + "step": 2665 + }, + { + "epoch": 0.8215068651205722, + "grad_norm": 1.0025163888931274, + "learning_rate": 4.1311110050322674e-05, + "loss": 1.3263, + "step": 2670 + }, + { + "epoch": 0.823045267489712, + "grad_norm": 1.1535359621047974, + "learning_rate": 4.128056564052558e-05, + "loss": 1.0747, + "step": 2675 + }, + { + "epoch": 0.8245836698588516, + "grad_norm": 1.1695550680160522, + "learning_rate": 4.124997897360617e-05, + "loss": 1.1265, + "step": 2680 + }, + { + "epoch": 0.8261220722279913, + "grad_norm": 0.9830222725868225, + "learning_rate": 4.1219350128953885e-05, + "loss": 1.1475, + "step": 2685 + }, + { + "epoch": 0.8276604745971309, + "grad_norm": 1.1742221117019653, + "learning_rate": 4.118867918606759e-05, + "loss": 1.2344, + "step": 2690 + }, + { + "epoch": 0.8291988769662705, + "grad_norm": 0.9631347060203552, + "learning_rate": 4.115796622455544e-05, + "loss": 1.1101, + "step": 2695 + }, + { + "epoch": 0.8307372793354102, + "grad_norm": 0.9511464238166809, + "learning_rate": 4.112721132413467e-05, + "loss": 1.2955, + "step": 2700 + }, + { + "epoch": 0.8322756817045498, + "grad_norm": 1.5212576389312744, + "learning_rate": 4.109641456463135e-05, + "loss": 1.1907, + "step": 2705 + }, + { + "epoch": 0.8338140840736895, + "grad_norm": 1.1152446269989014, + "learning_rate": 4.106557602598017e-05, + "loss": 1.143, + "step": 2710 + }, + { + "epoch": 0.8353524864428291, + "grad_norm": 1.1330682039260864, + "learning_rate": 4.103469578822432e-05, + "loss": 1.1143, + "step": 2715 + }, + { + "epoch": 0.8368908888119688, + "grad_norm": 1.170376181602478, + "learning_rate": 4.1003773931515175e-05, + "loss": 1.1621, + "step": 2720 + }, + { + "epoch": 0.8384292911811084, + "grad_norm": 1.571735143661499, + "learning_rate": 4.097281053611215e-05, + "loss": 1.1911, + "step": 2725 + }, + { + "epoch": 0.839967693550248, + "grad_norm": 0.9364776611328125, + "learning_rate": 4.0941805682382484e-05, + "loss": 1.171, + "step": 2730 + }, + { + "epoch": 0.8415060959193877, + "grad_norm": 1.2696608304977417, + "learning_rate": 4.091075945080101e-05, + "loss": 1.2642, + "step": 2735 + }, + { + "epoch": 0.8430444982885273, + "grad_norm": 4.6916584968566895, + "learning_rate": 4.087967192194997e-05, + "loss": 1.2212, + "step": 2740 + }, + { + "epoch": 0.8445829006576671, + "grad_norm": 0.9691728353500366, + "learning_rate": 4.0848543176518784e-05, + "loss": 1.1457, + "step": 2745 + }, + { + "epoch": 0.8461213030268067, + "grad_norm": 1.154585599899292, + "learning_rate": 4.081737329530386e-05, + "loss": 1.2012, + "step": 2750 + }, + { + "epoch": 0.8476597053959463, + "grad_norm": 1.3288275003433228, + "learning_rate": 4.0786162359208386e-05, + "loss": 1.2624, + "step": 2755 + }, + { + "epoch": 0.849198107765086, + "grad_norm": 4.425066947937012, + "learning_rate": 4.075491044924209e-05, + "loss": 1.0656, + "step": 2760 + }, + { + "epoch": 0.8507365101342256, + "grad_norm": 0.917151927947998, + "learning_rate": 4.072361764652105e-05, + "loss": 1.0405, + "step": 2765 + }, + { + "epoch": 0.8522749125033653, + "grad_norm": 0.9385243654251099, + "learning_rate": 4.0692284032267516e-05, + "loss": 1.2393, + "step": 2770 + }, + { + "epoch": 0.8538133148725049, + "grad_norm": 1.1418960094451904, + "learning_rate": 4.0660909687809625e-05, + "loss": 1.2778, + "step": 2775 + }, + { + "epoch": 0.8553517172416446, + "grad_norm": 0.9725258350372314, + "learning_rate": 4.062949469458125e-05, + "loss": 1.1223, + "step": 2780 + }, + { + "epoch": 0.8568901196107842, + "grad_norm": 0.8571771383285522, + "learning_rate": 4.059803913412178e-05, + "loss": 1.1825, + "step": 2785 + }, + { + "epoch": 0.8584285219799238, + "grad_norm": 1.6413207054138184, + "learning_rate": 4.056654308807588e-05, + "loss": 1.1504, + "step": 2790 + }, + { + "epoch": 0.8599669243490635, + "grad_norm": 1.0783355236053467, + "learning_rate": 4.053500663819331e-05, + "loss": 1.2997, + "step": 2795 + }, + { + "epoch": 0.8615053267182031, + "grad_norm": 1.5912343263626099, + "learning_rate": 4.05034298663287e-05, + "loss": 1.201, + "step": 2800 + }, + { + "epoch": 0.8630437290873428, + "grad_norm": 1.5445067882537842, + "learning_rate": 4.047181285444133e-05, + "loss": 1.0921, + "step": 2805 + }, + { + "epoch": 0.8645821314564824, + "grad_norm": 1.2195634841918945, + "learning_rate": 4.0440155684594915e-05, + "loss": 1.1598, + "step": 2810 + }, + { + "epoch": 0.866120533825622, + "grad_norm": 1.3299264907836914, + "learning_rate": 4.0408458438957454e-05, + "loss": 1.2355, + "step": 2815 + }, + { + "epoch": 0.8676589361947618, + "grad_norm": 1.7162269353866577, + "learning_rate": 4.0376721199800896e-05, + "loss": 1.1072, + "step": 2820 + }, + { + "epoch": 0.8691973385639014, + "grad_norm": 1.240971326828003, + "learning_rate": 4.034494404950103e-05, + "loss": 1.184, + "step": 2825 + }, + { + "epoch": 0.8707357409330411, + "grad_norm": 1.2077282667160034, + "learning_rate": 4.0313127070537244e-05, + "loss": 1.1602, + "step": 2830 + }, + { + "epoch": 0.8722741433021807, + "grad_norm": 0.9052446484565735, + "learning_rate": 4.028127034549229e-05, + "loss": 1.1715, + "step": 2835 + }, + { + "epoch": 0.8738125456713204, + "grad_norm": 1.6603598594665527, + "learning_rate": 4.024937395705209e-05, + "loss": 1.161, + "step": 2840 + }, + { + "epoch": 0.87535094804046, + "grad_norm": 1.1748160123825073, + "learning_rate": 4.0217437988005515e-05, + "loss": 1.2401, + "step": 2845 + }, + { + "epoch": 0.8768893504095996, + "grad_norm": 0.7816171646118164, + "learning_rate": 4.0185462521244146e-05, + "loss": 1.289, + "step": 2850 + }, + { + "epoch": 0.8784277527787393, + "grad_norm": 2.378026247024536, + "learning_rate": 4.015344763976212e-05, + "loss": 1.1013, + "step": 2855 + }, + { + "epoch": 0.8799661551478789, + "grad_norm": 0.9903531670570374, + "learning_rate": 4.012139342665586e-05, + "loss": 1.1572, + "step": 2860 + }, + { + "epoch": 0.8815045575170186, + "grad_norm": 1.0575708150863647, + "learning_rate": 4.0089299965123875e-05, + "loss": 1.1393, + "step": 2865 + }, + { + "epoch": 0.8830429598861582, + "grad_norm": 1.2809288501739502, + "learning_rate": 4.005716733846653e-05, + "loss": 1.1285, + "step": 2870 + }, + { + "epoch": 0.8845813622552978, + "grad_norm": 2.3209915161132812, + "learning_rate": 4.0024995630085885e-05, + "loss": 1.1748, + "step": 2875 + }, + { + "epoch": 0.8861197646244375, + "grad_norm": 1.7122613191604614, + "learning_rate": 3.999278492348539e-05, + "loss": 1.188, + "step": 2880 + }, + { + "epoch": 0.8876581669935771, + "grad_norm": 1.0947169065475464, + "learning_rate": 3.996053530226977e-05, + "loss": 1.1547, + "step": 2885 + }, + { + "epoch": 0.8891965693627168, + "grad_norm": 2.022311210632324, + "learning_rate": 3.992824685014471e-05, + "loss": 1.1289, + "step": 2890 + }, + { + "epoch": 0.8907349717318565, + "grad_norm": 1.397825837135315, + "learning_rate": 3.9895919650916716e-05, + "loss": 1.2032, + "step": 2895 + }, + { + "epoch": 0.8922733741009962, + "grad_norm": 0.9696956276893616, + "learning_rate": 3.9863553788492834e-05, + "loss": 1.0806, + "step": 2900 + }, + { + "epoch": 0.8938117764701358, + "grad_norm": 1.9239165782928467, + "learning_rate": 3.983114934688048e-05, + "loss": 1.1205, + "step": 2905 + }, + { + "epoch": 0.8953501788392754, + "grad_norm": 1.3101149797439575, + "learning_rate": 3.97987064101872e-05, + "loss": 1.1769, + "step": 2910 + }, + { + "epoch": 0.8968885812084151, + "grad_norm": 0.9820067286491394, + "learning_rate": 3.976622506262047e-05, + "loss": 1.1518, + "step": 2915 + }, + { + "epoch": 0.8984269835775547, + "grad_norm": 1.2778306007385254, + "learning_rate": 3.973370538848744e-05, + "loss": 1.156, + "step": 2920 + }, + { + "epoch": 0.8999653859466944, + "grad_norm": 2.279306411743164, + "learning_rate": 3.970114747219475e-05, + "loss": 1.0314, + "step": 2925 + }, + { + "epoch": 0.901503788315834, + "grad_norm": 1.2938816547393799, + "learning_rate": 3.966855139824831e-05, + "loss": 1.1778, + "step": 2930 + }, + { + "epoch": 0.9030421906849736, + "grad_norm": 0.8507199883460999, + "learning_rate": 3.963591725125305e-05, + "loss": 1.1217, + "step": 2935 + }, + { + "epoch": 0.9045805930541133, + "grad_norm": 1.3915897607803345, + "learning_rate": 3.9603245115912736e-05, + "loss": 1.2169, + "step": 2940 + }, + { + "epoch": 0.9061189954232529, + "grad_norm": 1.3529155254364014, + "learning_rate": 3.9570535077029724e-05, + "loss": 1.1911, + "step": 2945 + }, + { + "epoch": 0.9076573977923926, + "grad_norm": 1.5553487539291382, + "learning_rate": 3.953778721950477e-05, + "loss": 1.1837, + "step": 2950 + }, + { + "epoch": 0.9091958001615322, + "grad_norm": 1.1085933446884155, + "learning_rate": 3.9505001628336757e-05, + "loss": 1.2342, + "step": 2955 + }, + { + "epoch": 0.9107342025306719, + "grad_norm": 1.0837507247924805, + "learning_rate": 3.947217838862255e-05, + "loss": 1.2045, + "step": 2960 + }, + { + "epoch": 0.9122726048998115, + "grad_norm": 0.9050890207290649, + "learning_rate": 3.943931758555669e-05, + "loss": 1.2006, + "step": 2965 + }, + { + "epoch": 0.9138110072689511, + "grad_norm": 1.1101486682891846, + "learning_rate": 3.940641930443125e-05, + "loss": 1.1462, + "step": 2970 + }, + { + "epoch": 0.9153494096380909, + "grad_norm": 2.048758029937744, + "learning_rate": 3.9373483630635564e-05, + "loss": 1.1595, + "step": 2975 + }, + { + "epoch": 0.9168878120072305, + "grad_norm": 1.0824599266052246, + "learning_rate": 3.934051064965602e-05, + "loss": 1.1756, + "step": 2980 + }, + { + "epoch": 0.9184262143763702, + "grad_norm": 2.319460153579712, + "learning_rate": 3.9307500447075844e-05, + "loss": 1.1852, + "step": 2985 + }, + { + "epoch": 0.9199646167455098, + "grad_norm": 0.8897768259048462, + "learning_rate": 3.927445310857487e-05, + "loss": 1.0803, + "step": 2990 + }, + { + "epoch": 0.9215030191146495, + "grad_norm": 0.7820705771446228, + "learning_rate": 3.924136871992932e-05, + "loss": 1.2017, + "step": 2995 + }, + { + "epoch": 0.9230414214837891, + "grad_norm": 1.2413580417633057, + "learning_rate": 3.9208247367011574e-05, + "loss": 1.1179, + "step": 3000 + }, + { + "epoch": 0.9245798238529287, + "grad_norm": 2.127316951751709, + "learning_rate": 3.9175089135789987e-05, + "loss": 1.059, + "step": 3005 + }, + { + "epoch": 0.9261182262220684, + "grad_norm": 2.3183646202087402, + "learning_rate": 3.914189411232858e-05, + "loss": 1.1931, + "step": 3010 + }, + { + "epoch": 0.927656628591208, + "grad_norm": 1.7735154628753662, + "learning_rate": 3.9108662382786925e-05, + "loss": 1.1452, + "step": 3015 + }, + { + "epoch": 0.9291950309603477, + "grad_norm": 1.7767513990402222, + "learning_rate": 3.9075394033419826e-05, + "loss": 1.0446, + "step": 3020 + }, + { + "epoch": 0.9307334333294873, + "grad_norm": 0.9649571776390076, + "learning_rate": 3.904208915057716e-05, + "loss": 1.1965, + "step": 3025 + }, + { + "epoch": 0.9322718356986269, + "grad_norm": 1.123539686203003, + "learning_rate": 3.900874782070362e-05, + "loss": 1.1604, + "step": 3030 + }, + { + "epoch": 0.9338102380677666, + "grad_norm": 1.48845374584198, + "learning_rate": 3.897537013033849e-05, + "loss": 1.1589, + "step": 3035 + }, + { + "epoch": 0.9353486404369062, + "grad_norm": 0.8311707377433777, + "learning_rate": 3.8941956166115454e-05, + "loss": 1.2621, + "step": 3040 + }, + { + "epoch": 0.936887042806046, + "grad_norm": 1.6711184978485107, + "learning_rate": 3.890850601476233e-05, + "loss": 1.2114, + "step": 3045 + }, + { + "epoch": 0.9384254451751856, + "grad_norm": 1.2482969760894775, + "learning_rate": 3.887501976310086e-05, + "loss": 1.1918, + "step": 3050 + }, + { + "epoch": 0.9399638475443253, + "grad_norm": 1.0476139783859253, + "learning_rate": 3.884149749804648e-05, + "loss": 1.1405, + "step": 3055 + }, + { + "epoch": 0.9415022499134649, + "grad_norm": 0.864454984664917, + "learning_rate": 3.880793930660813e-05, + "loss": 1.1581, + "step": 3060 + }, + { + "epoch": 0.9430406522826045, + "grad_norm": 1.3563313484191895, + "learning_rate": 3.877434527588798e-05, + "loss": 1.1005, + "step": 3065 + }, + { + "epoch": 0.9445790546517442, + "grad_norm": 1.5107038021087646, + "learning_rate": 3.8740715493081203e-05, + "loss": 1.0558, + "step": 3070 + }, + { + "epoch": 0.9461174570208838, + "grad_norm": 1.4398322105407715, + "learning_rate": 3.87070500454758e-05, + "loss": 1.2505, + "step": 3075 + }, + { + "epoch": 0.9476558593900235, + "grad_norm": 1.0815845727920532, + "learning_rate": 3.867334902045234e-05, + "loss": 1.2257, + "step": 3080 + }, + { + "epoch": 0.9491942617591631, + "grad_norm": 1.0675805807113647, + "learning_rate": 3.863961250548371e-05, + "loss": 1.0496, + "step": 3085 + }, + { + "epoch": 0.9507326641283027, + "grad_norm": 1.3433740139007568, + "learning_rate": 3.860584058813495e-05, + "loss": 1.2241, + "step": 3090 + }, + { + "epoch": 0.9522710664974424, + "grad_norm": 0.8536397218704224, + "learning_rate": 3.8572033356062943e-05, + "loss": 1.2073, + "step": 3095 + }, + { + "epoch": 0.953809468866582, + "grad_norm": 1.1579813957214355, + "learning_rate": 3.853819089701627e-05, + "loss": 1.2149, + "step": 3100 + }, + { + "epoch": 0.9553478712357217, + "grad_norm": 1.0714601278305054, + "learning_rate": 3.850431329883493e-05, + "loss": 1.2619, + "step": 3105 + }, + { + "epoch": 0.9568862736048613, + "grad_norm": 1.3584688901901245, + "learning_rate": 3.847040064945014e-05, + "loss": 1.1022, + "step": 3110 + }, + { + "epoch": 0.958424675974001, + "grad_norm": 3.527039051055908, + "learning_rate": 3.843645303688408e-05, + "loss": 1.3198, + "step": 3115 + }, + { + "epoch": 0.9599630783431407, + "grad_norm": 0.9427330493927002, + "learning_rate": 3.840247054924968e-05, + "loss": 1.1009, + "step": 3120 + }, + { + "epoch": 0.9615014807122803, + "grad_norm": 1.2566494941711426, + "learning_rate": 3.836845327475041e-05, + "loss": 1.1355, + "step": 3125 + }, + { + "epoch": 0.96303988308142, + "grad_norm": 1.2846300601959229, + "learning_rate": 3.833440130167999e-05, + "loss": 1.1365, + "step": 3130 + }, + { + "epoch": 0.9645782854505596, + "grad_norm": 0.7137424945831299, + "learning_rate": 3.830031471842226e-05, + "loss": 1.0481, + "step": 3135 + }, + { + "epoch": 0.9661166878196993, + "grad_norm": 2.03141188621521, + "learning_rate": 3.826619361345084e-05, + "loss": 1.32, + "step": 3140 + }, + { + "epoch": 0.9676550901888389, + "grad_norm": 2.1782736778259277, + "learning_rate": 3.823203807532898e-05, + "loss": 1.2055, + "step": 3145 + }, + { + "epoch": 0.9691934925579785, + "grad_norm": 0.9288005232810974, + "learning_rate": 3.8197848192709286e-05, + "loss": 1.1541, + "step": 3150 + }, + { + "epoch": 0.9707318949271182, + "grad_norm": 1.3783565759658813, + "learning_rate": 3.816362405433353e-05, + "loss": 1.213, + "step": 3155 + }, + { + "epoch": 0.9722702972962578, + "grad_norm": 2.103428363800049, + "learning_rate": 3.81293657490324e-05, + "loss": 1.2212, + "step": 3160 + }, + { + "epoch": 0.9738086996653975, + "grad_norm": 0.9342910051345825, + "learning_rate": 3.809507336572522e-05, + "loss": 1.2327, + "step": 3165 + }, + { + "epoch": 0.9753471020345371, + "grad_norm": 1.7723548412322998, + "learning_rate": 3.80607469934198e-05, + "loss": 1.1732, + "step": 3170 + }, + { + "epoch": 0.9768855044036768, + "grad_norm": 1.1316125392913818, + "learning_rate": 3.80263867212122e-05, + "loss": 1.1486, + "step": 3175 + }, + { + "epoch": 0.9784239067728164, + "grad_norm": 0.8731313347816467, + "learning_rate": 3.79919926382864e-05, + "loss": 1.2067, + "step": 3180 + }, + { + "epoch": 0.979962309141956, + "grad_norm": 0.8731054067611694, + "learning_rate": 3.795756483391419e-05, + "loss": 1.1672, + "step": 3185 + }, + { + "epoch": 0.9815007115110957, + "grad_norm": 1.165216326713562, + "learning_rate": 3.792310339745486e-05, + "loss": 1.1711, + "step": 3190 + }, + { + "epoch": 0.9830391138802353, + "grad_norm": 1.8140064477920532, + "learning_rate": 3.788860841835502e-05, + "loss": 1.1883, + "step": 3195 + }, + { + "epoch": 0.9845775162493751, + "grad_norm": 1.9973480701446533, + "learning_rate": 3.785407998614831e-05, + "loss": 1.0442, + "step": 3200 + }, + { + "epoch": 0.9861159186185147, + "grad_norm": 1.0596879720687866, + "learning_rate": 3.781951819045521e-05, + "loss": 1.2081, + "step": 3205 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.9382377862930298, + "learning_rate": 3.778492312098283e-05, + "loss": 1.2106, + "step": 3210 + }, + { + "epoch": 0.989192723356794, + "grad_norm": 1.565450668334961, + "learning_rate": 3.7750294867524585e-05, + "loss": 1.0911, + "step": 3215 + }, + { + "epoch": 0.9907311257259336, + "grad_norm": 1.8136720657348633, + "learning_rate": 3.771563351996006e-05, + "loss": 1.1857, + "step": 3220 + }, + { + "epoch": 0.9922695280950733, + "grad_norm": 1.7879712581634521, + "learning_rate": 3.7680939168254733e-05, + "loss": 1.2005, + "step": 3225 + }, + { + "epoch": 0.9938079304642129, + "grad_norm": 0.8127261996269226, + "learning_rate": 3.7646211902459736e-05, + "loss": 1.1781, + "step": 3230 + }, + { + "epoch": 0.9953463328333526, + "grad_norm": 0.8571304082870483, + "learning_rate": 3.761145181271164e-05, + "loss": 1.2937, + "step": 3235 + }, + { + "epoch": 0.9968847352024922, + "grad_norm": 0.9852257966995239, + "learning_rate": 3.757665898923223e-05, + "loss": 1.0958, + "step": 3240 + }, + { + "epoch": 0.9984231375716318, + "grad_norm": 1.0459402799606323, + "learning_rate": 3.75418335223282e-05, + "loss": 1.353, + "step": 3245 + }, + { + "epoch": 0.9999615399407715, + "grad_norm": 0.9802791476249695, + "learning_rate": 3.750697550239102e-05, + "loss": 1.1591, + "step": 3250 + }, + { + "epoch": 1.0014999423099111, + "grad_norm": 1.1630258560180664, + "learning_rate": 3.747208501989663e-05, + "loss": 1.1936, + "step": 3255 + }, + { + "epoch": 1.0030383446790507, + "grad_norm": 1.5038493871688843, + "learning_rate": 3.743716216540526e-05, + "loss": 1.1801, + "step": 3260 + }, + { + "epoch": 1.0045767470481906, + "grad_norm": 3.152625799179077, + "learning_rate": 3.7402207029561116e-05, + "loss": 1.0709, + "step": 3265 + }, + { + "epoch": 1.0061151494173302, + "grad_norm": 0.881455659866333, + "learning_rate": 3.736721970309222e-05, + "loss": 1.2295, + "step": 3270 + }, + { + "epoch": 1.0076535517864698, + "grad_norm": 1.6096093654632568, + "learning_rate": 3.7332200276810145e-05, + "loss": 1.1981, + "step": 3275 + }, + { + "epoch": 1.0091919541556094, + "grad_norm": 0.9019604325294495, + "learning_rate": 3.7297148841609785e-05, + "loss": 1.1362, + "step": 3280 + }, + { + "epoch": 1.010730356524749, + "grad_norm": 1.8702175617218018, + "learning_rate": 3.72620654884691e-05, + "loss": 1.0372, + "step": 3285 + }, + { + "epoch": 1.0122687588938888, + "grad_norm": 0.9774813652038574, + "learning_rate": 3.722695030844891e-05, + "loss": 1.0715, + "step": 3290 + }, + { + "epoch": 1.0138071612630284, + "grad_norm": 2.0321578979492188, + "learning_rate": 3.7191803392692626e-05, + "loss": 1.2701, + "step": 3295 + }, + { + "epoch": 1.015345563632168, + "grad_norm": 0.8944913744926453, + "learning_rate": 3.715662483242605e-05, + "loss": 1.1084, + "step": 3300 + }, + { + "epoch": 1.0168839660013076, + "grad_norm": 0.9244990944862366, + "learning_rate": 3.712141471895711e-05, + "loss": 1.1145, + "step": 3305 + }, + { + "epoch": 1.0184223683704472, + "grad_norm": 1.136596441268921, + "learning_rate": 3.708617314367562e-05, + "loss": 1.0873, + "step": 3310 + }, + { + "epoch": 1.019960770739587, + "grad_norm": 0.8394299745559692, + "learning_rate": 3.7050900198053096e-05, + "loss": 1.1808, + "step": 3315 + }, + { + "epoch": 1.0214991731087266, + "grad_norm": 1.818331003189087, + "learning_rate": 3.701559597364242e-05, + "loss": 1.1542, + "step": 3320 + }, + { + "epoch": 1.0230375754778662, + "grad_norm": 0.9814029932022095, + "learning_rate": 3.6980260562077694e-05, + "loss": 1.1148, + "step": 3325 + }, + { + "epoch": 1.0245759778470058, + "grad_norm": 0.9357233643531799, + "learning_rate": 3.6944894055073984e-05, + "loss": 1.1327, + "step": 3330 + }, + { + "epoch": 1.0261143802161454, + "grad_norm": 1.1413395404815674, + "learning_rate": 3.690949654442701e-05, + "loss": 1.053, + "step": 3335 + }, + { + "epoch": 1.0276527825852853, + "grad_norm": 0.9855743050575256, + "learning_rate": 3.6874068122013035e-05, + "loss": 1.0974, + "step": 3340 + }, + { + "epoch": 1.0291911849544249, + "grad_norm": 1.7641159296035767, + "learning_rate": 3.6838608879788496e-05, + "loss": 1.2385, + "step": 3345 + }, + { + "epoch": 1.0307295873235645, + "grad_norm": 1.2974658012390137, + "learning_rate": 3.680311890978985e-05, + "loss": 1.2269, + "step": 3350 + }, + { + "epoch": 1.032267989692704, + "grad_norm": 1.1772881746292114, + "learning_rate": 3.6767598304133324e-05, + "loss": 1.2326, + "step": 3355 + }, + { + "epoch": 1.0338063920618439, + "grad_norm": 1.706217646598816, + "learning_rate": 3.673204715501461e-05, + "loss": 1.0957, + "step": 3360 + }, + { + "epoch": 1.0353447944309835, + "grad_norm": 1.6567659378051758, + "learning_rate": 3.669646555470874e-05, + "loss": 1.1362, + "step": 3365 + }, + { + "epoch": 1.036883196800123, + "grad_norm": 2.460679769515991, + "learning_rate": 3.6660853595569754e-05, + "loss": 1.1386, + "step": 3370 + }, + { + "epoch": 1.0384215991692627, + "grad_norm": 1.0992755889892578, + "learning_rate": 3.662521137003048e-05, + "loss": 1.0517, + "step": 3375 + }, + { + "epoch": 1.0399600015384023, + "grad_norm": 1.0326275825500488, + "learning_rate": 3.6589538970602325e-05, + "loss": 1.1088, + "step": 3380 + }, + { + "epoch": 1.0414984039075421, + "grad_norm": 0.9226017594337463, + "learning_rate": 3.6553836489875e-05, + "loss": 1.1799, + "step": 3385 + }, + { + "epoch": 1.0430368062766817, + "grad_norm": 2.348419189453125, + "learning_rate": 3.65181040205163e-05, + "loss": 1.1934, + "step": 3390 + }, + { + "epoch": 1.0445752086458213, + "grad_norm": 0.982110857963562, + "learning_rate": 3.648234165527185e-05, + "loss": 1.1904, + "step": 3395 + }, + { + "epoch": 1.046113611014961, + "grad_norm": 1.326486587524414, + "learning_rate": 3.6446549486964884e-05, + "loss": 1.1015, + "step": 3400 + }, + { + "epoch": 1.0476520133841005, + "grad_norm": 1.5188394784927368, + "learning_rate": 3.641072760849599e-05, + "loss": 1.1911, + "step": 3405 + }, + { + "epoch": 1.0491904157532403, + "grad_norm": 0.9235185384750366, + "learning_rate": 3.637487611284285e-05, + "loss": 1.1657, + "step": 3410 + }, + { + "epoch": 1.05072881812238, + "grad_norm": 1.6619833707809448, + "learning_rate": 3.633899509306004e-05, + "loss": 1.108, + "step": 3415 + }, + { + "epoch": 1.0522672204915196, + "grad_norm": 1.4106361865997314, + "learning_rate": 3.630308464227877e-05, + "loss": 1.2603, + "step": 3420 + }, + { + "epoch": 1.0538056228606592, + "grad_norm": 1.5941157341003418, + "learning_rate": 3.626714485370662e-05, + "loss": 1.2641, + "step": 3425 + }, + { + "epoch": 1.0553440252297988, + "grad_norm": 1.66989004611969, + "learning_rate": 3.6231175820627344e-05, + "loss": 1.2124, + "step": 3430 + }, + { + "epoch": 1.0568824275989386, + "grad_norm": 1.6584402322769165, + "learning_rate": 3.6195177636400565e-05, + "loss": 1.1966, + "step": 3435 + }, + { + "epoch": 1.0584208299680782, + "grad_norm": 1.228973150253296, + "learning_rate": 3.615915039446162e-05, + "loss": 0.9945, + "step": 3440 + }, + { + "epoch": 1.0599592323372178, + "grad_norm": 2.353611946105957, + "learning_rate": 3.6123094188321205e-05, + "loss": 1.1743, + "step": 3445 + }, + { + "epoch": 1.0614976347063574, + "grad_norm": 1.3476722240447998, + "learning_rate": 3.608700911156525e-05, + "loss": 1.0994, + "step": 3450 + }, + { + "epoch": 1.0630360370754972, + "grad_norm": 2.2004075050354004, + "learning_rate": 3.60508952578546e-05, + "loss": 1.0864, + "step": 3455 + }, + { + "epoch": 1.0645744394446368, + "grad_norm": 1.1636754274368286, + "learning_rate": 3.601475272092478e-05, + "loss": 1.1245, + "step": 3460 + }, + { + "epoch": 1.0661128418137764, + "grad_norm": 1.2166703939437866, + "learning_rate": 3.597858159458578e-05, + "loss": 1.1763, + "step": 3465 + }, + { + "epoch": 1.067651244182916, + "grad_norm": 1.0816514492034912, + "learning_rate": 3.594238197272177e-05, + "loss": 1.2429, + "step": 3470 + }, + { + "epoch": 1.0691896465520556, + "grad_norm": 1.839756965637207, + "learning_rate": 3.5906153949290935e-05, + "loss": 1.1356, + "step": 3475 + }, + { + "epoch": 1.0707280489211954, + "grad_norm": 1.1021186113357544, + "learning_rate": 3.5869897618325126e-05, + "loss": 1.2321, + "step": 3480 + }, + { + "epoch": 1.072266451290335, + "grad_norm": 1.5904773473739624, + "learning_rate": 3.5833613073929684e-05, + "loss": 1.1777, + "step": 3485 + }, + { + "epoch": 1.0738048536594746, + "grad_norm": 1.026505947113037, + "learning_rate": 3.579730041028317e-05, + "loss": 1.1767, + "step": 3490 + }, + { + "epoch": 1.0753432560286142, + "grad_norm": 1.023456335067749, + "learning_rate": 3.576095972163718e-05, + "loss": 1.1832, + "step": 3495 + }, + { + "epoch": 1.0768816583977538, + "grad_norm": 0.9071354269981384, + "learning_rate": 3.572459110231598e-05, + "loss": 1.0945, + "step": 3500 + }, + { + "epoch": 1.0784200607668937, + "grad_norm": 1.8704088926315308, + "learning_rate": 3.568819464671637e-05, + "loss": 1.1804, + "step": 3505 + }, + { + "epoch": 1.0799584631360333, + "grad_norm": 2.5177040100097656, + "learning_rate": 3.565177044930739e-05, + "loss": 1.1204, + "step": 3510 + }, + { + "epoch": 1.0814968655051729, + "grad_norm": 2.31343674659729, + "learning_rate": 3.56153186046301e-05, + "loss": 1.0483, + "step": 3515 + }, + { + "epoch": 1.0830352678743125, + "grad_norm": 1.1547645330429077, + "learning_rate": 3.5578839207297306e-05, + "loss": 1.1178, + "step": 3520 + }, + { + "epoch": 1.084573670243452, + "grad_norm": 1.569669246673584, + "learning_rate": 3.5542332351993324e-05, + "loss": 1.1746, + "step": 3525 + }, + { + "epoch": 1.086112072612592, + "grad_norm": 1.2639979124069214, + "learning_rate": 3.550579813347376e-05, + "loss": 1.0747, + "step": 3530 + }, + { + "epoch": 1.0876504749817315, + "grad_norm": 0.9912775754928589, + "learning_rate": 3.546923664656523e-05, + "loss": 1.1877, + "step": 3535 + }, + { + "epoch": 1.089188877350871, + "grad_norm": 1.9763455390930176, + "learning_rate": 3.54326479861651e-05, + "loss": 1.2445, + "step": 3540 + }, + { + "epoch": 1.0907272797200107, + "grad_norm": 0.9376320838928223, + "learning_rate": 3.539603224724133e-05, + "loss": 1.2454, + "step": 3545 + }, + { + "epoch": 1.0922656820891503, + "grad_norm": 1.8034776449203491, + "learning_rate": 3.535938952483211e-05, + "loss": 1.2255, + "step": 3550 + }, + { + "epoch": 1.0938040844582901, + "grad_norm": 1.3023470640182495, + "learning_rate": 3.532271991404568e-05, + "loss": 1.1488, + "step": 3555 + }, + { + "epoch": 1.0953424868274297, + "grad_norm": 1.3908381462097168, + "learning_rate": 3.528602351006006e-05, + "loss": 1.1498, + "step": 3560 + }, + { + "epoch": 1.0968808891965693, + "grad_norm": 2.6319515705108643, + "learning_rate": 3.524930040812286e-05, + "loss": 1.2605, + "step": 3565 + }, + { + "epoch": 1.098419291565709, + "grad_norm": 1.7958158254623413, + "learning_rate": 3.521255070355093e-05, + "loss": 1.1793, + "step": 3570 + }, + { + "epoch": 1.0999576939348485, + "grad_norm": 1.8693649768829346, + "learning_rate": 3.5175774491730204e-05, + "loss": 1.2756, + "step": 3575 + }, + { + "epoch": 1.1014960963039884, + "grad_norm": 4.634489059448242, + "learning_rate": 3.513897186811539e-05, + "loss": 1.1377, + "step": 3580 + }, + { + "epoch": 1.103034498673128, + "grad_norm": 1.2640043497085571, + "learning_rate": 3.510214292822978e-05, + "loss": 1.0172, + "step": 3585 + }, + { + "epoch": 1.1045729010422676, + "grad_norm": 0.9812289476394653, + "learning_rate": 3.506528776766495e-05, + "loss": 1.2052, + "step": 3590 + }, + { + "epoch": 1.1061113034114072, + "grad_norm": 0.8409488797187805, + "learning_rate": 3.5028406482080536e-05, + "loss": 1.2435, + "step": 3595 + }, + { + "epoch": 1.107649705780547, + "grad_norm": 0.9872679710388184, + "learning_rate": 3.499149916720398e-05, + "loss": 1.06, + "step": 3600 + }, + { + "epoch": 1.1091881081496866, + "grad_norm": 2.1323914527893066, + "learning_rate": 3.495456591883031e-05, + "loss": 1.0506, + "step": 3605 + }, + { + "epoch": 1.1107265105188262, + "grad_norm": 1.2901802062988281, + "learning_rate": 3.4917606832821824e-05, + "loss": 1.099, + "step": 3610 + }, + { + "epoch": 1.1122649128879658, + "grad_norm": 1.414628505706787, + "learning_rate": 3.488062200510791e-05, + "loss": 1.1987, + "step": 3615 + }, + { + "epoch": 1.1138033152571054, + "grad_norm": 1.1899245977401733, + "learning_rate": 3.4843611531684786e-05, + "loss": 1.1429, + "step": 3620 + }, + { + "epoch": 1.1153417176262452, + "grad_norm": 0.9524982571601868, + "learning_rate": 3.480657550861518e-05, + "loss": 1.1281, + "step": 3625 + }, + { + "epoch": 1.1168801199953848, + "grad_norm": 1.219228744506836, + "learning_rate": 3.47695140320282e-05, + "loss": 1.1641, + "step": 3630 + }, + { + "epoch": 1.1184185223645244, + "grad_norm": 2.172743320465088, + "learning_rate": 3.473242719811897e-05, + "loss": 1.2067, + "step": 3635 + }, + { + "epoch": 1.119956924733664, + "grad_norm": 1.288135051727295, + "learning_rate": 3.4695315103148454e-05, + "loss": 1.1588, + "step": 3640 + }, + { + "epoch": 1.1214953271028036, + "grad_norm": 1.3018666505813599, + "learning_rate": 3.465817784344318e-05, + "loss": 1.1281, + "step": 3645 + }, + { + "epoch": 1.1230337294719435, + "grad_norm": 1.5916048288345337, + "learning_rate": 3.462101551539499e-05, + "loss": 1.2701, + "step": 3650 + }, + { + "epoch": 1.124572131841083, + "grad_norm": 1.3709497451782227, + "learning_rate": 3.45838282154608e-05, + "loss": 1.223, + "step": 3655 + }, + { + "epoch": 1.1261105342102227, + "grad_norm": 1.505096673965454, + "learning_rate": 3.4546616040162334e-05, + "loss": 1.1068, + "step": 3660 + }, + { + "epoch": 1.1276489365793623, + "grad_norm": 1.1039677858352661, + "learning_rate": 3.450937908608587e-05, + "loss": 1.216, + "step": 3665 + }, + { + "epoch": 1.129187338948502, + "grad_norm": 1.4067137241363525, + "learning_rate": 3.4472117449882025e-05, + "loss": 1.1304, + "step": 3670 + }, + { + "epoch": 1.1307257413176417, + "grad_norm": 0.9000976085662842, + "learning_rate": 3.443483122826547e-05, + "loss": 1.1709, + "step": 3675 + }, + { + "epoch": 1.1322641436867813, + "grad_norm": 1.4269496202468872, + "learning_rate": 3.439752051801467e-05, + "loss": 1.2391, + "step": 3680 + }, + { + "epoch": 1.133802546055921, + "grad_norm": 0.959568977355957, + "learning_rate": 3.436018541597169e-05, + "loss": 1.1445, + "step": 3685 + }, + { + "epoch": 1.1353409484250605, + "grad_norm": 0.9494476318359375, + "learning_rate": 3.4322826019041864e-05, + "loss": 1.1943, + "step": 3690 + }, + { + "epoch": 1.1368793507942003, + "grad_norm": 0.8512333631515503, + "learning_rate": 3.428544242419362e-05, + "loss": 1.2019, + "step": 3695 + }, + { + "epoch": 1.13841775316334, + "grad_norm": 0.9752541780471802, + "learning_rate": 3.4248034728458175e-05, + "loss": 1.1219, + "step": 3700 + }, + { + "epoch": 1.1399561555324795, + "grad_norm": 1.4414304494857788, + "learning_rate": 3.4210603028929295e-05, + "loss": 1.2591, + "step": 3705 + }, + { + "epoch": 1.1414945579016191, + "grad_norm": 1.0669443607330322, + "learning_rate": 3.417314742276308e-05, + "loss": 1.1153, + "step": 3710 + }, + { + "epoch": 1.1430329602707587, + "grad_norm": 1.064900279045105, + "learning_rate": 3.413566800717762e-05, + "loss": 1.2842, + "step": 3715 + }, + { + "epoch": 1.1445713626398986, + "grad_norm": 1.052623987197876, + "learning_rate": 3.409816487945286e-05, + "loss": 1.1178, + "step": 3720 + }, + { + "epoch": 1.1461097650090382, + "grad_norm": 1.8392632007598877, + "learning_rate": 3.4060638136930304e-05, + "loss": 1.1444, + "step": 3725 + }, + { + "epoch": 1.1476481673781778, + "grad_norm": 1.3130204677581787, + "learning_rate": 3.402308787701268e-05, + "loss": 1.1948, + "step": 3730 + }, + { + "epoch": 1.1491865697473174, + "grad_norm": 1.6042733192443848, + "learning_rate": 3.398551419716382e-05, + "loss": 1.096, + "step": 3735 + }, + { + "epoch": 1.150724972116457, + "grad_norm": 1.7846676111221313, + "learning_rate": 3.3947917194908306e-05, + "loss": 1.2706, + "step": 3740 + }, + { + "epoch": 1.1522633744855968, + "grad_norm": 1.7049797773361206, + "learning_rate": 3.3910296967831266e-05, + "loss": 1.1966, + "step": 3745 + }, + { + "epoch": 1.1538017768547364, + "grad_norm": 0.9584757685661316, + "learning_rate": 3.3872653613578134e-05, + "loss": 1.1628, + "step": 3750 + }, + { + "epoch": 1.155340179223876, + "grad_norm": 1.128525972366333, + "learning_rate": 3.383498722985432e-05, + "loss": 1.163, + "step": 3755 + }, + { + "epoch": 1.1568785815930156, + "grad_norm": 1.1508246660232544, + "learning_rate": 3.379729791442506e-05, + "loss": 1.1373, + "step": 3760 + }, + { + "epoch": 1.1584169839621552, + "grad_norm": 0.9009411334991455, + "learning_rate": 3.375958576511508e-05, + "loss": 1.1696, + "step": 3765 + }, + { + "epoch": 1.159955386331295, + "grad_norm": 1.28462553024292, + "learning_rate": 3.372185087980838e-05, + "loss": 1.0845, + "step": 3770 + }, + { + "epoch": 1.1614937887004346, + "grad_norm": 1.2223782539367676, + "learning_rate": 3.368409335644798e-05, + "loss": 1.1714, + "step": 3775 + }, + { + "epoch": 1.1630321910695742, + "grad_norm": 1.155945062637329, + "learning_rate": 3.364631329303564e-05, + "loss": 1.1125, + "step": 3780 + }, + { + "epoch": 1.1645705934387138, + "grad_norm": 2.37724232673645, + "learning_rate": 3.3608510787631654e-05, + "loss": 1.2847, + "step": 3785 + }, + { + "epoch": 1.1661089958078534, + "grad_norm": 1.2505033016204834, + "learning_rate": 3.357068593835453e-05, + "loss": 1.1486, + "step": 3790 + }, + { + "epoch": 1.1676473981769933, + "grad_norm": 1.1727055311203003, + "learning_rate": 3.35328388433808e-05, + "loss": 1.1909, + "step": 3795 + }, + { + "epoch": 1.1691858005461329, + "grad_norm": 2.013213872909546, + "learning_rate": 3.3494969600944715e-05, + "loss": 1.1553, + "step": 3800 + }, + { + "epoch": 1.1707242029152725, + "grad_norm": 1.5769529342651367, + "learning_rate": 3.345707830933803e-05, + "loss": 1.2083, + "step": 3805 + }, + { + "epoch": 1.172262605284412, + "grad_norm": 1.1164283752441406, + "learning_rate": 3.3419165066909705e-05, + "loss": 1.1371, + "step": 3810 + }, + { + "epoch": 1.1738010076535517, + "grad_norm": 1.1431792974472046, + "learning_rate": 3.338122997206571e-05, + "loss": 1.1377, + "step": 3815 + }, + { + "epoch": 1.1753394100226915, + "grad_norm": 1.0517457723617554, + "learning_rate": 3.33432731232687e-05, + "loss": 1.1152, + "step": 3820 + }, + { + "epoch": 1.176877812391831, + "grad_norm": 1.159220814704895, + "learning_rate": 3.3305294619037805e-05, + "loss": 1.1877, + "step": 3825 + }, + { + "epoch": 1.1784162147609707, + "grad_norm": 1.6318359375, + "learning_rate": 3.326729455794838e-05, + "loss": 1.1014, + "step": 3830 + }, + { + "epoch": 1.1799546171301103, + "grad_norm": 1.4239236116409302, + "learning_rate": 3.322927303863171e-05, + "loss": 1.2185, + "step": 3835 + }, + { + "epoch": 1.1814930194992501, + "grad_norm": 2.194650650024414, + "learning_rate": 3.319123015977478e-05, + "loss": 1.1972, + "step": 3840 + }, + { + "epoch": 1.1830314218683897, + "grad_norm": 0.9801701307296753, + "learning_rate": 3.315316602012001e-05, + "loss": 1.1984, + "step": 3845 + }, + { + "epoch": 1.1845698242375293, + "grad_norm": 1.7991083860397339, + "learning_rate": 3.311508071846504e-05, + "loss": 1.1742, + "step": 3850 + }, + { + "epoch": 1.186108226606669, + "grad_norm": 2.0683629512786865, + "learning_rate": 3.307697435366237e-05, + "loss": 1.1949, + "step": 3855 + }, + { + "epoch": 1.1876466289758085, + "grad_norm": 1.1672157049179077, + "learning_rate": 3.303884702461924e-05, + "loss": 1.1302, + "step": 3860 + }, + { + "epoch": 1.1891850313449484, + "grad_norm": 2.2780561447143555, + "learning_rate": 3.300069883029727e-05, + "loss": 1.1606, + "step": 3865 + }, + { + "epoch": 1.190723433714088, + "grad_norm": 0.9155642986297607, + "learning_rate": 3.296252986971222e-05, + "loss": 1.2142, + "step": 3870 + }, + { + "epoch": 1.1922618360832276, + "grad_norm": 0.7124369740486145, + "learning_rate": 3.29243402419338e-05, + "loss": 1.2005, + "step": 3875 + }, + { + "epoch": 1.1938002384523672, + "grad_norm": 0.9560266137123108, + "learning_rate": 3.2886130046085306e-05, + "loss": 1.1488, + "step": 3880 + }, + { + "epoch": 1.195338640821507, + "grad_norm": 2.2523956298828125, + "learning_rate": 3.284789938134346e-05, + "loss": 1.1306, + "step": 3885 + }, + { + "epoch": 1.1968770431906466, + "grad_norm": 1.153905987739563, + "learning_rate": 3.2809648346938105e-05, + "loss": 1.2155, + "step": 3890 + }, + { + "epoch": 1.1984154455597862, + "grad_norm": 1.6172610521316528, + "learning_rate": 3.2771377042151944e-05, + "loss": 1.1306, + "step": 3895 + }, + { + "epoch": 1.1999538479289258, + "grad_norm": 1.023131012916565, + "learning_rate": 3.2733085566320285e-05, + "loss": 1.1581, + "step": 3900 + }, + { + "epoch": 1.2014922502980654, + "grad_norm": 1.280826449394226, + "learning_rate": 3.2694774018830816e-05, + "loss": 1.238, + "step": 3905 + }, + { + "epoch": 1.2030306526672052, + "grad_norm": 0.9869760870933533, + "learning_rate": 3.265644249912331e-05, + "loss": 1.2139, + "step": 3910 + }, + { + "epoch": 1.2045690550363448, + "grad_norm": 2.917651414871216, + "learning_rate": 3.261809110668937e-05, + "loss": 0.9819, + "step": 3915 + }, + { + "epoch": 1.2061074574054844, + "grad_norm": 0.8974500298500061, + "learning_rate": 3.25797199410722e-05, + "loss": 1.144, + "step": 3920 + }, + { + "epoch": 1.207645859774624, + "grad_norm": 1.1364940404891968, + "learning_rate": 3.254132910186631e-05, + "loss": 1.2498, + "step": 3925 + }, + { + "epoch": 1.2091842621437636, + "grad_norm": 1.5066908597946167, + "learning_rate": 3.2502918688717285e-05, + "loss": 1.1648, + "step": 3930 + }, + { + "epoch": 1.2107226645129034, + "grad_norm": 1.5828046798706055, + "learning_rate": 3.2464488801321494e-05, + "loss": 1.2591, + "step": 3935 + }, + { + "epoch": 1.212261066882043, + "grad_norm": 1.869602918624878, + "learning_rate": 3.2426039539425876e-05, + "loss": 1.2592, + "step": 3940 + }, + { + "epoch": 1.2137994692511826, + "grad_norm": 0.9975070357322693, + "learning_rate": 3.2387571002827656e-05, + "loss": 1.2349, + "step": 3945 + }, + { + "epoch": 1.2153378716203223, + "grad_norm": 2.0380489826202393, + "learning_rate": 3.234908329137406e-05, + "loss": 1.1056, + "step": 3950 + }, + { + "epoch": 1.2168762739894619, + "grad_norm": 1.3352590799331665, + "learning_rate": 3.231057650496214e-05, + "loss": 1.1998, + "step": 3955 + }, + { + "epoch": 1.2184146763586017, + "grad_norm": 1.1479742527008057, + "learning_rate": 3.2272050743538385e-05, + "loss": 1.1915, + "step": 3960 + }, + { + "epoch": 1.2199530787277413, + "grad_norm": 1.3033512830734253, + "learning_rate": 3.22335061070986e-05, + "loss": 1.1046, + "step": 3965 + }, + { + "epoch": 1.2214914810968809, + "grad_norm": 1.171144723892212, + "learning_rate": 3.219494269568753e-05, + "loss": 1.1206, + "step": 3970 + }, + { + "epoch": 1.2230298834660205, + "grad_norm": 1.440745234489441, + "learning_rate": 3.215636060939869e-05, + "loss": 1.112, + "step": 3975 + }, + { + "epoch": 1.22456828583516, + "grad_norm": 1.118881344795227, + "learning_rate": 3.211775994837405e-05, + "loss": 1.166, + "step": 3980 + }, + { + "epoch": 1.2261066882043, + "grad_norm": 1.8601933717727661, + "learning_rate": 3.207914081280379e-05, + "loss": 1.2695, + "step": 3985 + }, + { + "epoch": 1.2276450905734395, + "grad_norm": 1.5979291200637817, + "learning_rate": 3.204050330292604e-05, + "loss": 1.1158, + "step": 3990 + }, + { + "epoch": 1.2291834929425791, + "grad_norm": 1.368026614189148, + "learning_rate": 3.2001847519026646e-05, + "loss": 1.1164, + "step": 3995 + }, + { + "epoch": 1.2307218953117187, + "grad_norm": 1.9506460428237915, + "learning_rate": 3.196317356143884e-05, + "loss": 1.1607, + "step": 4000 + }, + { + "epoch": 1.2322602976808583, + "grad_norm": 1.1299954652786255, + "learning_rate": 3.192448153054306e-05, + "loss": 1.1518, + "step": 4005 + }, + { + "epoch": 1.2337987000499981, + "grad_norm": 1.389312982559204, + "learning_rate": 3.188577152676666e-05, + "loss": 1.0802, + "step": 4010 + }, + { + "epoch": 1.2353371024191377, + "grad_norm": 1.1280195713043213, + "learning_rate": 3.1847043650583604e-05, + "loss": 1.1637, + "step": 4015 + }, + { + "epoch": 1.2368755047882773, + "grad_norm": 0.959011971950531, + "learning_rate": 3.180829800251428e-05, + "loss": 1.1652, + "step": 4020 + }, + { + "epoch": 1.238413907157417, + "grad_norm": 1.3471107482910156, + "learning_rate": 3.1769534683125195e-05, + "loss": 1.1713, + "step": 4025 + }, + { + "epoch": 1.2399523095265566, + "grad_norm": 1.7551147937774658, + "learning_rate": 3.1730753793028724e-05, + "loss": 1.1362, + "step": 4030 + }, + { + "epoch": 1.2414907118956964, + "grad_norm": 1.71892249584198, + "learning_rate": 3.169195543288283e-05, + "loss": 1.1525, + "step": 4035 + }, + { + "epoch": 1.243029114264836, + "grad_norm": 1.1600077152252197, + "learning_rate": 3.165313970339087e-05, + "loss": 1.1085, + "step": 4040 + }, + { + "epoch": 1.2445675166339756, + "grad_norm": 0.9502371549606323, + "learning_rate": 3.1614306705301204e-05, + "loss": 1.2047, + "step": 4045 + }, + { + "epoch": 1.2461059190031152, + "grad_norm": 1.5982012748718262, + "learning_rate": 3.15754565394071e-05, + "loss": 1.167, + "step": 4050 + }, + { + "epoch": 1.247644321372255, + "grad_norm": 1.2877519130706787, + "learning_rate": 3.153658930654631e-05, + "loss": 1.0551, + "step": 4055 + }, + { + "epoch": 1.2491827237413946, + "grad_norm": 0.9973001480102539, + "learning_rate": 3.149770510760093e-05, + "loss": 1.0985, + "step": 4060 + }, + { + "epoch": 1.2507211261105342, + "grad_norm": 2.4550881385803223, + "learning_rate": 3.14588040434971e-05, + "loss": 1.2302, + "step": 4065 + }, + { + "epoch": 1.2522595284796738, + "grad_norm": 1.004104733467102, + "learning_rate": 3.1419886215204694e-05, + "loss": 1.1658, + "step": 4070 + }, + { + "epoch": 1.2537979308488136, + "grad_norm": 2.3353803157806396, + "learning_rate": 3.138095172373714e-05, + "loss": 1.1095, + "step": 4075 + }, + { + "epoch": 1.2553363332179532, + "grad_norm": 3.044813394546509, + "learning_rate": 3.134200067015108e-05, + "loss": 1.2342, + "step": 4080 + }, + { + "epoch": 1.2568747355870928, + "grad_norm": 1.2953006029129028, + "learning_rate": 3.130303315554617e-05, + "loss": 1.0888, + "step": 4085 + }, + { + "epoch": 1.2584131379562324, + "grad_norm": 1.468589425086975, + "learning_rate": 3.1264049281064775e-05, + "loss": 1.0256, + "step": 4090 + }, + { + "epoch": 1.259951540325372, + "grad_norm": 1.058424949645996, + "learning_rate": 3.1225049147891737e-05, + "loss": 1.1953, + "step": 4095 + }, + { + "epoch": 1.2614899426945119, + "grad_norm": 1.2426944971084595, + "learning_rate": 3.118603285725409e-05, + "loss": 1.2162, + "step": 4100 + }, + { + "epoch": 1.2630283450636515, + "grad_norm": 1.318311095237732, + "learning_rate": 3.114700051042081e-05, + "loss": 1.0836, + "step": 4105 + }, + { + "epoch": 1.264566747432791, + "grad_norm": 1.1298370361328125, + "learning_rate": 3.110795220870253e-05, + "loss": 1.1453, + "step": 4110 + }, + { + "epoch": 1.2661051498019307, + "grad_norm": 1.2516565322875977, + "learning_rate": 3.1068888053451334e-05, + "loss": 1.1972, + "step": 4115 + }, + { + "epoch": 1.2676435521710703, + "grad_norm": 1.364283800125122, + "learning_rate": 3.102980814606042e-05, + "loss": 1.0966, + "step": 4120 + }, + { + "epoch": 1.26918195454021, + "grad_norm": 1.593657374382019, + "learning_rate": 3.099071258796387e-05, + "loss": 1.1734, + "step": 4125 + }, + { + "epoch": 1.2707203569093497, + "grad_norm": 1.190921425819397, + "learning_rate": 3.0951601480636404e-05, + "loss": 1.2381, + "step": 4130 + }, + { + "epoch": 1.2722587592784893, + "grad_norm": 1.0939090251922607, + "learning_rate": 3.091247492559312e-05, + "loss": 1.0485, + "step": 4135 + }, + { + "epoch": 1.273797161647629, + "grad_norm": 1.5152145624160767, + "learning_rate": 3.087333302438916e-05, + "loss": 1.0804, + "step": 4140 + }, + { + "epoch": 1.2753355640167685, + "grad_norm": 1.4711779356002808, + "learning_rate": 3.0834175878619546e-05, + "loss": 1.0904, + "step": 4145 + }, + { + "epoch": 1.2768739663859083, + "grad_norm": 1.8078420162200928, + "learning_rate": 3.0795003589918834e-05, + "loss": 1.1033, + "step": 4150 + }, + { + "epoch": 1.278412368755048, + "grad_norm": 1.389979600906372, + "learning_rate": 3.0755816259960915e-05, + "loss": 1.0441, + "step": 4155 + }, + { + "epoch": 1.2799507711241875, + "grad_norm": 1.3937149047851562, + "learning_rate": 3.071661399045869e-05, + "loss": 1.235, + "step": 4160 + }, + { + "epoch": 1.2814891734933271, + "grad_norm": 1.5922768115997314, + "learning_rate": 3.067739688316385e-05, + "loss": 1.1928, + "step": 4165 + }, + { + "epoch": 1.2830275758624667, + "grad_norm": 1.0673027038574219, + "learning_rate": 3.0638165039866614e-05, + "loss": 1.0306, + "step": 4170 + }, + { + "epoch": 1.2845659782316066, + "grad_norm": 1.3722416162490845, + "learning_rate": 3.0598918562395425e-05, + "loss": 1.0907, + "step": 4175 + }, + { + "epoch": 1.2861043806007462, + "grad_norm": 0.9146732687950134, + "learning_rate": 3.0559657552616716e-05, + "loss": 1.1372, + "step": 4180 + }, + { + "epoch": 1.2876427829698858, + "grad_norm": 1.4379206895828247, + "learning_rate": 3.0520382112434636e-05, + "loss": 1.1397, + "step": 4185 + }, + { + "epoch": 1.2891811853390254, + "grad_norm": 1.0497307777404785, + "learning_rate": 3.0481092343790822e-05, + "loss": 1.1, + "step": 4190 + }, + { + "epoch": 1.290719587708165, + "grad_norm": 0.9752093553543091, + "learning_rate": 3.044178834866405e-05, + "loss": 1.1479, + "step": 4195 + }, + { + "epoch": 1.2922579900773048, + "grad_norm": 1.132630467414856, + "learning_rate": 3.0402470229070056e-05, + "loss": 1.1148, + "step": 4200 + }, + { + "epoch": 1.2937963924464444, + "grad_norm": 1.2013700008392334, + "learning_rate": 3.0363138087061222e-05, + "loss": 1.0205, + "step": 4205 + }, + { + "epoch": 1.295334794815584, + "grad_norm": 1.7345274686813354, + "learning_rate": 3.032379202472634e-05, + "loss": 1.1862, + "step": 4210 + }, + { + "epoch": 1.2968731971847236, + "grad_norm": 1.4804950952529907, + "learning_rate": 3.0284432144190315e-05, + "loss": 1.0884, + "step": 4215 + }, + { + "epoch": 1.2984115995538632, + "grad_norm": 1.7368816137313843, + "learning_rate": 3.0245058547613924e-05, + "loss": 1.2023, + "step": 4220 + }, + { + "epoch": 1.299950001923003, + "grad_norm": 1.1152502298355103, + "learning_rate": 3.0205671337193566e-05, + "loss": 1.1609, + "step": 4225 + }, + { + "epoch": 1.3014884042921426, + "grad_norm": 0.8946184515953064, + "learning_rate": 3.016627061516094e-05, + "loss": 1.0451, + "step": 4230 + }, + { + "epoch": 1.3030268066612822, + "grad_norm": 0.9706270098686218, + "learning_rate": 3.0126856483782838e-05, + "loss": 1.1886, + "step": 4235 + }, + { + "epoch": 1.3045652090304218, + "grad_norm": 2.113147020339966, + "learning_rate": 3.008742904536085e-05, + "loss": 1.1681, + "step": 4240 + }, + { + "epoch": 1.3061036113995614, + "grad_norm": 1.2672151327133179, + "learning_rate": 3.0047988402231113e-05, + "loss": 1.1769, + "step": 4245 + }, + { + "epoch": 1.3076420137687013, + "grad_norm": 0.8190154433250427, + "learning_rate": 3.000853465676402e-05, + "loss": 1.0251, + "step": 4250 + }, + { + "epoch": 1.3091804161378409, + "grad_norm": 1.9547206163406372, + "learning_rate": 2.9969067911363992e-05, + "loss": 1.2783, + "step": 4255 + }, + { + "epoch": 1.3107188185069805, + "grad_norm": 1.1879409551620483, + "learning_rate": 2.992958826846918e-05, + "loss": 1.0925, + "step": 4260 + }, + { + "epoch": 1.31225722087612, + "grad_norm": 1.0043494701385498, + "learning_rate": 2.9890095830551207e-05, + "loss": 1.135, + "step": 4265 + }, + { + "epoch": 1.3137956232452597, + "grad_norm": 1.3702094554901123, + "learning_rate": 2.985059070011492e-05, + "loss": 1.1819, + "step": 4270 + }, + { + "epoch": 1.3153340256143995, + "grad_norm": 1.28737473487854, + "learning_rate": 2.9811072979698107e-05, + "loss": 1.146, + "step": 4275 + }, + { + "epoch": 1.316872427983539, + "grad_norm": 1.6883684396743774, + "learning_rate": 2.9771542771871224e-05, + "loss": 1.0705, + "step": 4280 + }, + { + "epoch": 1.3184108303526787, + "grad_norm": 0.9764033555984497, + "learning_rate": 2.973200017923715e-05, + "loss": 1.1656, + "step": 4285 + }, + { + "epoch": 1.3199492327218185, + "grad_norm": 1.5175962448120117, + "learning_rate": 2.96924453044309e-05, + "loss": 1.1809, + "step": 4290 + }, + { + "epoch": 1.321487635090958, + "grad_norm": 1.7922492027282715, + "learning_rate": 2.9652878250119375e-05, + "loss": 1.0016, + "step": 4295 + }, + { + "epoch": 1.3230260374600977, + "grad_norm": 1.114981770515442, + "learning_rate": 2.9613299119001082e-05, + "loss": 1.2292, + "step": 4300 + }, + { + "epoch": 1.3245644398292373, + "grad_norm": 1.1757748126983643, + "learning_rate": 2.9573708013805885e-05, + "loss": 1.1243, + "step": 4305 + }, + { + "epoch": 1.326102842198377, + "grad_norm": 1.034613847732544, + "learning_rate": 2.953410503729471e-05, + "loss": 1.2089, + "step": 4310 + }, + { + "epoch": 1.3276412445675168, + "grad_norm": 1.4121217727661133, + "learning_rate": 2.9494490292259326e-05, + "loss": 1.0332, + "step": 4315 + }, + { + "epoch": 1.3291796469366564, + "grad_norm": 1.6308557987213135, + "learning_rate": 2.945486388152201e-05, + "loss": 1.1536, + "step": 4320 + }, + { + "epoch": 1.330718049305796, + "grad_norm": 1.5427446365356445, + "learning_rate": 2.941522590793534e-05, + "loss": 1.2825, + "step": 4325 + }, + { + "epoch": 1.3322564516749356, + "grad_norm": 2.0228891372680664, + "learning_rate": 2.9375576474381905e-05, + "loss": 1.0036, + "step": 4330 + }, + { + "epoch": 1.3337948540440752, + "grad_norm": 1.04116952419281, + "learning_rate": 2.9335915683774034e-05, + "loss": 1.0644, + "step": 4335 + }, + { + "epoch": 1.335333256413215, + "grad_norm": 2.5048625469207764, + "learning_rate": 2.9296243639053545e-05, + "loss": 1.0763, + "step": 4340 + }, + { + "epoch": 1.3368716587823546, + "grad_norm": 1.2487713098526, + "learning_rate": 2.9256560443191434e-05, + "loss": 1.2327, + "step": 4345 + }, + { + "epoch": 1.3384100611514942, + "grad_norm": 1.1335622072219849, + "learning_rate": 2.9216866199187697e-05, + "loss": 1.1097, + "step": 4350 + }, + { + "epoch": 1.3399484635206338, + "grad_norm": 1.3442351818084717, + "learning_rate": 2.9177161010070946e-05, + "loss": 1.1278, + "step": 4355 + }, + { + "epoch": 1.3414868658897734, + "grad_norm": 1.2338272333145142, + "learning_rate": 2.9137444978898244e-05, + "loss": 1.0971, + "step": 4360 + }, + { + "epoch": 1.3430252682589132, + "grad_norm": 1.0198732614517212, + "learning_rate": 2.9097718208754777e-05, + "loss": 1.1223, + "step": 4365 + }, + { + "epoch": 1.3445636706280528, + "grad_norm": 1.195879578590393, + "learning_rate": 2.90579808027536e-05, + "loss": 1.1469, + "step": 4370 + }, + { + "epoch": 1.3461020729971924, + "grad_norm": 1.2800102233886719, + "learning_rate": 2.901823286403539e-05, + "loss": 1.1358, + "step": 4375 + }, + { + "epoch": 1.347640475366332, + "grad_norm": 1.07575261592865, + "learning_rate": 2.897847449576815e-05, + "loss": 1.1122, + "step": 4380 + }, + { + "epoch": 1.3491788777354716, + "grad_norm": 0.9419439435005188, + "learning_rate": 2.8938705801146958e-05, + "loss": 1.1214, + "step": 4385 + }, + { + "epoch": 1.3507172801046115, + "grad_norm": 1.0005466938018799, + "learning_rate": 2.88989268833937e-05, + "loss": 1.2561, + "step": 4390 + }, + { + "epoch": 1.352255682473751, + "grad_norm": 0.9994781613349915, + "learning_rate": 2.8859137845756784e-05, + "loss": 1.125, + "step": 4395 + }, + { + "epoch": 1.3537940848428907, + "grad_norm": 1.0525974035263062, + "learning_rate": 2.8819338791510887e-05, + "loss": 1.1561, + "step": 4400 + }, + { + "epoch": 1.3553324872120303, + "grad_norm": 1.1305088996887207, + "learning_rate": 2.8779529823956704e-05, + "loss": 1.129, + "step": 4405 + }, + { + "epoch": 1.3568708895811699, + "grad_norm": 1.073445439338684, + "learning_rate": 2.8739711046420626e-05, + "loss": 1.1403, + "step": 4410 + }, + { + "epoch": 1.3584092919503097, + "grad_norm": 1.792883038520813, + "learning_rate": 2.8699882562254538e-05, + "loss": 1.0386, + "step": 4415 + }, + { + "epoch": 1.3599476943194493, + "grad_norm": 1.0318684577941895, + "learning_rate": 2.8660044474835514e-05, + "loss": 1.2134, + "step": 4420 + }, + { + "epoch": 1.3614860966885889, + "grad_norm": 0.9824214577674866, + "learning_rate": 2.862019688756553e-05, + "loss": 1.1874, + "step": 4425 + }, + { + "epoch": 1.3630244990577285, + "grad_norm": 3.1488852500915527, + "learning_rate": 2.858033990387125e-05, + "loss": 1.1967, + "step": 4430 + }, + { + "epoch": 1.364562901426868, + "grad_norm": 2.4233949184417725, + "learning_rate": 2.8540473627203708e-05, + "loss": 1.0966, + "step": 4435 + }, + { + "epoch": 1.366101303796008, + "grad_norm": 1.065972924232483, + "learning_rate": 2.8500598161038057e-05, + "loss": 1.0942, + "step": 4440 + }, + { + "epoch": 1.3676397061651475, + "grad_norm": 1.105650782585144, + "learning_rate": 2.8460713608873323e-05, + "loss": 1.2002, + "step": 4445 + }, + { + "epoch": 1.3691781085342871, + "grad_norm": 2.50470232963562, + "learning_rate": 2.8420820074232086e-05, + "loss": 1.0063, + "step": 4450 + }, + { + "epoch": 1.3707165109034267, + "grad_norm": 1.2763255834579468, + "learning_rate": 2.8380917660660262e-05, + "loss": 1.1806, + "step": 4455 + }, + { + "epoch": 1.3722549132725663, + "grad_norm": 1.4548883438110352, + "learning_rate": 2.8341006471726816e-05, + "loss": 1.091, + "step": 4460 + }, + { + "epoch": 1.3737933156417061, + "grad_norm": 4.325429916381836, + "learning_rate": 2.830108661102346e-05, + "loss": 1.1485, + "step": 4465 + }, + { + "epoch": 1.3753317180108457, + "grad_norm": 1.4059125185012817, + "learning_rate": 2.826115818216444e-05, + "loss": 1.109, + "step": 4470 + }, + { + "epoch": 1.3768701203799854, + "grad_norm": 1.088011384010315, + "learning_rate": 2.822122128878625e-05, + "loss": 1.1762, + "step": 4475 + }, + { + "epoch": 1.378408522749125, + "grad_norm": 0.8821271061897278, + "learning_rate": 2.818127603454732e-05, + "loss": 1.0974, + "step": 4480 + }, + { + "epoch": 1.3799469251182646, + "grad_norm": 1.2587283849716187, + "learning_rate": 2.8141322523127817e-05, + "loss": 1.0408, + "step": 4485 + }, + { + "epoch": 1.3814853274874044, + "grad_norm": 1.244844675064087, + "learning_rate": 2.810136085822931e-05, + "loss": 1.1508, + "step": 4490 + }, + { + "epoch": 1.383023729856544, + "grad_norm": 1.0243417024612427, + "learning_rate": 2.8061391143574545e-05, + "loss": 1.14, + "step": 4495 + }, + { + "epoch": 1.3845621322256836, + "grad_norm": 1.2699311971664429, + "learning_rate": 2.8021413482907176e-05, + "loss": 1.264, + "step": 4500 + }, + { + "epoch": 1.3861005345948234, + "grad_norm": 1.102189064025879, + "learning_rate": 2.798142797999144e-05, + "loss": 1.0759, + "step": 4505 + }, + { + "epoch": 1.3876389369639628, + "grad_norm": 1.9496086835861206, + "learning_rate": 2.794143473861198e-05, + "loss": 1.0819, + "step": 4510 + }, + { + "epoch": 1.3891773393331026, + "grad_norm": 0.8148252367973328, + "learning_rate": 2.7901433862573495e-05, + "loss": 1.1767, + "step": 4515 + }, + { + "epoch": 1.3907157417022422, + "grad_norm": 1.303236961364746, + "learning_rate": 2.786142545570049e-05, + "loss": 1.2153, + "step": 4520 + }, + { + "epoch": 1.3922541440713818, + "grad_norm": 2.4543964862823486, + "learning_rate": 2.782140962183704e-05, + "loss": 0.9841, + "step": 4525 + }, + { + "epoch": 1.3937925464405216, + "grad_norm": 1.1783264875411987, + "learning_rate": 2.7781386464846497e-05, + "loss": 1.1419, + "step": 4530 + }, + { + "epoch": 1.3953309488096612, + "grad_norm": 1.238843321800232, + "learning_rate": 2.7741356088611205e-05, + "loss": 1.0705, + "step": 4535 + }, + { + "epoch": 1.3968693511788008, + "grad_norm": 1.183280348777771, + "learning_rate": 2.7701318597032248e-05, + "loss": 1.2047, + "step": 4540 + }, + { + "epoch": 1.3984077535479404, + "grad_norm": 1.5718815326690674, + "learning_rate": 2.7661274094029193e-05, + "loss": 1.2228, + "step": 4545 + }, + { + "epoch": 1.39994615591708, + "grad_norm": 1.5735619068145752, + "learning_rate": 2.7621222683539792e-05, + "loss": 1.2255, + "step": 4550 + }, + { + "epoch": 1.4014845582862199, + "grad_norm": 1.1554558277130127, + "learning_rate": 2.7581164469519732e-05, + "loss": 1.1794, + "step": 4555 + }, + { + "epoch": 1.4030229606553595, + "grad_norm": 1.498023509979248, + "learning_rate": 2.754109955594235e-05, + "loss": 1.1865, + "step": 4560 + }, + { + "epoch": 1.404561363024499, + "grad_norm": 1.2338696718215942, + "learning_rate": 2.7501028046798387e-05, + "loss": 1.0968, + "step": 4565 + }, + { + "epoch": 1.4060997653936387, + "grad_norm": 1.8019814491271973, + "learning_rate": 2.7460950046095696e-05, + "loss": 1.1228, + "step": 4570 + }, + { + "epoch": 1.4076381677627783, + "grad_norm": 1.3556607961654663, + "learning_rate": 2.742086565785896e-05, + "loss": 1.1827, + "step": 4575 + }, + { + "epoch": 1.409176570131918, + "grad_norm": 1.5109410285949707, + "learning_rate": 2.738077498612949e-05, + "loss": 1.1437, + "step": 4580 + }, + { + "epoch": 1.4107149725010577, + "grad_norm": 1.0716326236724854, + "learning_rate": 2.7340678134964855e-05, + "loss": 1.2372, + "step": 4585 + }, + { + "epoch": 1.4122533748701973, + "grad_norm": 1.2416647672653198, + "learning_rate": 2.7300575208438683e-05, + "loss": 1.1694, + "step": 4590 + }, + { + "epoch": 1.413791777239337, + "grad_norm": 1.2061866521835327, + "learning_rate": 2.7260466310640377e-05, + "loss": 1.2479, + "step": 4595 + }, + { + "epoch": 1.4153301796084765, + "grad_norm": 1.7378590106964111, + "learning_rate": 2.7220351545674834e-05, + "loss": 1.1552, + "step": 4600 + }, + { + "epoch": 1.4168685819776163, + "grad_norm": 1.3554580211639404, + "learning_rate": 2.7180231017662178e-05, + "loss": 1.1731, + "step": 4605 + }, + { + "epoch": 1.418406984346756, + "grad_norm": 1.5967758893966675, + "learning_rate": 2.7140104830737496e-05, + "loss": 1.2213, + "step": 4610 + }, + { + "epoch": 1.4199453867158955, + "grad_norm": 1.3568000793457031, + "learning_rate": 2.709997308905055e-05, + "loss": 1.1097, + "step": 4615 + }, + { + "epoch": 1.4214837890850351, + "grad_norm": 1.3305199146270752, + "learning_rate": 2.705983589676554e-05, + "loss": 1.0566, + "step": 4620 + }, + { + "epoch": 1.4230221914541747, + "grad_norm": 1.3310613632202148, + "learning_rate": 2.7019693358060792e-05, + "loss": 1.1266, + "step": 4625 + }, + { + "epoch": 1.4245605938233146, + "grad_norm": 1.6512633562088013, + "learning_rate": 2.6979545577128522e-05, + "loss": 1.1205, + "step": 4630 + }, + { + "epoch": 1.4260989961924542, + "grad_norm": 0.985725462436676, + "learning_rate": 2.6939392658174568e-05, + "loss": 1.1683, + "step": 4635 + }, + { + "epoch": 1.4276373985615938, + "grad_norm": 1.284145712852478, + "learning_rate": 2.6899234705418052e-05, + "loss": 1.1016, + "step": 4640 + }, + { + "epoch": 1.4291758009307334, + "grad_norm": 1.1945017576217651, + "learning_rate": 2.685907182309122e-05, + "loss": 1.1832, + "step": 4645 + }, + { + "epoch": 1.430714203299873, + "grad_norm": 1.2088850736618042, + "learning_rate": 2.681890411543908e-05, + "loss": 1.1577, + "step": 4650 + }, + { + "epoch": 1.4322526056690128, + "grad_norm": 2.208871841430664, + "learning_rate": 2.6778731686719178e-05, + "loss": 1.2002, + "step": 4655 + }, + { + "epoch": 1.4337910080381524, + "grad_norm": 0.9800326824188232, + "learning_rate": 2.6738554641201298e-05, + "loss": 1.1888, + "step": 4660 + }, + { + "epoch": 1.435329410407292, + "grad_norm": 5.183529853820801, + "learning_rate": 2.669837308316723e-05, + "loss": 1.0284, + "step": 4665 + }, + { + "epoch": 1.4368678127764316, + "grad_norm": 1.1594517230987549, + "learning_rate": 2.6658187116910455e-05, + "loss": 1.1301, + "step": 4670 + }, + { + "epoch": 1.4384062151455712, + "grad_norm": 1.1288197040557861, + "learning_rate": 2.6617996846735904e-05, + "loss": 1.1265, + "step": 4675 + }, + { + "epoch": 1.439944617514711, + "grad_norm": 2.477808952331543, + "learning_rate": 2.6577802376959698e-05, + "loss": 1.1718, + "step": 4680 + }, + { + "epoch": 1.4414830198838506, + "grad_norm": 1.1020774841308594, + "learning_rate": 2.653760381190881e-05, + "loss": 1.1919, + "step": 4685 + }, + { + "epoch": 1.4430214222529902, + "grad_norm": 1.5409555435180664, + "learning_rate": 2.64974012559209e-05, + "loss": 1.1858, + "step": 4690 + }, + { + "epoch": 1.4445598246221298, + "grad_norm": 1.6623104810714722, + "learning_rate": 2.6457194813343948e-05, + "loss": 1.2185, + "step": 4695 + }, + { + "epoch": 1.4460982269912694, + "grad_norm": 1.2908927202224731, + "learning_rate": 2.641698458853603e-05, + "loss": 1.0986, + "step": 4700 + }, + { + "epoch": 1.4476366293604093, + "grad_norm": 1.120782732963562, + "learning_rate": 2.637677068586505e-05, + "loss": 1.2856, + "step": 4705 + }, + { + "epoch": 1.4491750317295489, + "grad_norm": 0.8865787386894226, + "learning_rate": 2.6336553209708447e-05, + "loss": 1.2691, + "step": 4710 + }, + { + "epoch": 1.4507134340986885, + "grad_norm": 1.8774168491363525, + "learning_rate": 2.6296332264452934e-05, + "loss": 1.1015, + "step": 4715 + }, + { + "epoch": 1.4522518364678283, + "grad_norm": 1.0902308225631714, + "learning_rate": 2.6256107954494242e-05, + "loss": 1.1832, + "step": 4720 + }, + { + "epoch": 1.4537902388369677, + "grad_norm": 1.2977639436721802, + "learning_rate": 2.6215880384236818e-05, + "loss": 1.2232, + "step": 4725 + }, + { + "epoch": 1.4553286412061075, + "grad_norm": 1.2228929996490479, + "learning_rate": 2.6175649658093586e-05, + "loss": 1.0803, + "step": 4730 + }, + { + "epoch": 1.456867043575247, + "grad_norm": 1.099314570426941, + "learning_rate": 2.6135415880485654e-05, + "loss": 1.1727, + "step": 4735 + }, + { + "epoch": 1.4584054459443867, + "grad_norm": 1.762929081916809, + "learning_rate": 2.609517915584204e-05, + "loss": 1.2126, + "step": 4740 + }, + { + "epoch": 1.4599438483135265, + "grad_norm": 1.0587412118911743, + "learning_rate": 2.6054939588599448e-05, + "loss": 1.1546, + "step": 4745 + }, + { + "epoch": 1.4614822506826661, + "grad_norm": 1.0290122032165527, + "learning_rate": 2.6014697283201907e-05, + "loss": 1.0129, + "step": 4750 + }, + { + "epoch": 1.4630206530518057, + "grad_norm": 1.801530361175537, + "learning_rate": 2.597445234410058e-05, + "loss": 1.2372, + "step": 4755 + }, + { + "epoch": 1.4645590554209453, + "grad_norm": 2.0911014080047607, + "learning_rate": 2.5934204875753494e-05, + "loss": 1.0657, + "step": 4760 + }, + { + "epoch": 1.466097457790085, + "grad_norm": 1.0208872556686401, + "learning_rate": 2.589395498262519e-05, + "loss": 1.2039, + "step": 4765 + }, + { + "epoch": 1.4676358601592248, + "grad_norm": 1.2312899827957153, + "learning_rate": 2.5853702769186528e-05, + "loss": 1.1466, + "step": 4770 + }, + { + "epoch": 1.4691742625283644, + "grad_norm": 1.3418748378753662, + "learning_rate": 2.5813448339914393e-05, + "loss": 1.1439, + "step": 4775 + }, + { + "epoch": 1.470712664897504, + "grad_norm": 1.0246156454086304, + "learning_rate": 2.5773191799291417e-05, + "loss": 1.1856, + "step": 4780 + }, + { + "epoch": 1.4722510672666436, + "grad_norm": 1.1938049793243408, + "learning_rate": 2.5732933251805713e-05, + "loss": 1.1144, + "step": 4785 + }, + { + "epoch": 1.4737894696357832, + "grad_norm": 2.593672037124634, + "learning_rate": 2.569267280195059e-05, + "loss": 1.1321, + "step": 4790 + }, + { + "epoch": 1.475327872004923, + "grad_norm": 1.1651784181594849, + "learning_rate": 2.5652410554224322e-05, + "loss": 1.1729, + "step": 4795 + }, + { + "epoch": 1.4768662743740626, + "grad_norm": 1.3578174114227295, + "learning_rate": 2.5612146613129828e-05, + "loss": 1.0952, + "step": 4800 + }, + { + "epoch": 1.4784046767432022, + "grad_norm": 2.145015001296997, + "learning_rate": 2.5571881083174427e-05, + "loss": 1.0512, + "step": 4805 + }, + { + "epoch": 1.4799430791123418, + "grad_norm": 1.4912582635879517, + "learning_rate": 2.553161406886955e-05, + "loss": 1.0594, + "step": 4810 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 1.3633592128753662, + "learning_rate": 2.5491345674730522e-05, + "loss": 1.2463, + "step": 4815 + }, + { + "epoch": 1.4830198838506212, + "grad_norm": 1.1500188112258911, + "learning_rate": 2.5451076005276197e-05, + "loss": 1.1274, + "step": 4820 + }, + { + "epoch": 1.4845582862197608, + "grad_norm": 1.1924505233764648, + "learning_rate": 2.5410805165028772e-05, + "loss": 1.0678, + "step": 4825 + }, + { + "epoch": 1.4860966885889004, + "grad_norm": 2.115426540374756, + "learning_rate": 2.537053325851348e-05, + "loss": 1.2476, + "step": 4830 + }, + { + "epoch": 1.48763509095804, + "grad_norm": 1.191912055015564, + "learning_rate": 2.5330260390258302e-05, + "loss": 1.1183, + "step": 4835 + }, + { + "epoch": 1.4891734933271796, + "grad_norm": 0.8928827047348022, + "learning_rate": 2.5289986664793743e-05, + "loss": 1.1317, + "step": 4840 + }, + { + "epoch": 1.4907118956963195, + "grad_norm": 1.3792165517807007, + "learning_rate": 2.52497121866525e-05, + "loss": 1.2063, + "step": 4845 + }, + { + "epoch": 1.492250298065459, + "grad_norm": 1.0137156248092651, + "learning_rate": 2.520943706036927e-05, + "loss": 1.1008, + "step": 4850 + }, + { + "epoch": 1.4937887004345987, + "grad_norm": 1.1347684860229492, + "learning_rate": 2.5169161390480382e-05, + "loss": 1.1237, + "step": 4855 + }, + { + "epoch": 1.4953271028037383, + "grad_norm": 1.1808063983917236, + "learning_rate": 2.5128885281523606e-05, + "loss": 1.242, + "step": 4860 + }, + { + "epoch": 1.4968655051728779, + "grad_norm": 1.2332245111465454, + "learning_rate": 2.508860883803784e-05, + "loss": 1.1619, + "step": 4865 + }, + { + "epoch": 1.4984039075420177, + "grad_norm": 1.0898313522338867, + "learning_rate": 2.5048332164562872e-05, + "loss": 1.2496, + "step": 4870 + }, + { + "epoch": 1.4999423099111573, + "grad_norm": 1.0363882780075073, + "learning_rate": 2.500805536563905e-05, + "loss": 1.1321, + "step": 4875 + }, + { + "epoch": 1.501480712280297, + "grad_norm": 1.2297104597091675, + "learning_rate": 2.4967778545807074e-05, + "loss": 1.1335, + "step": 4880 + }, + { + "epoch": 1.5030191146494367, + "grad_norm": 1.6471554040908813, + "learning_rate": 2.4927501809607692e-05, + "loss": 1.1601, + "step": 4885 + }, + { + "epoch": 1.504557517018576, + "grad_norm": 1.9324811697006226, + "learning_rate": 2.4887225261581436e-05, + "loss": 1.1363, + "step": 4890 + }, + { + "epoch": 1.506095919387716, + "grad_norm": 1.3468040227890015, + "learning_rate": 2.4846949006268344e-05, + "loss": 1.2231, + "step": 4895 + }, + { + "epoch": 1.5076343217568555, + "grad_norm": 1.3519035577774048, + "learning_rate": 2.4806673148207693e-05, + "loss": 1.2626, + "step": 4900 + }, + { + "epoch": 1.5091727241259951, + "grad_norm": 0.9620422720909119, + "learning_rate": 2.476639779193776e-05, + "loss": 1.2636, + "step": 4905 + }, + { + "epoch": 1.510711126495135, + "grad_norm": 1.1294174194335938, + "learning_rate": 2.4726123041995463e-05, + "loss": 1.234, + "step": 4910 + }, + { + "epoch": 1.5122495288642743, + "grad_norm": 1.0007851123809814, + "learning_rate": 2.4685849002916183e-05, + "loss": 1.1924, + "step": 4915 + }, + { + "epoch": 1.5137879312334142, + "grad_norm": 1.8807976245880127, + "learning_rate": 2.4645575779233464e-05, + "loss": 1.2864, + "step": 4920 + }, + { + "epoch": 1.5153263336025538, + "grad_norm": 2.8548858165740967, + "learning_rate": 2.460530347547871e-05, + "loss": 1.1517, + "step": 4925 + }, + { + "epoch": 1.5168647359716934, + "grad_norm": 1.5825186967849731, + "learning_rate": 2.4565032196180952e-05, + "loss": 1.2202, + "step": 4930 + }, + { + "epoch": 1.5184031383408332, + "grad_norm": 1.0249121189117432, + "learning_rate": 2.4524762045866555e-05, + "loss": 1.2323, + "step": 4935 + }, + { + "epoch": 1.5199415407099726, + "grad_norm": 0.9302557706832886, + "learning_rate": 2.4484493129058944e-05, + "loss": 1.2025, + "step": 4940 + }, + { + "epoch": 1.5214799430791124, + "grad_norm": 2.6338999271392822, + "learning_rate": 2.444422555027837e-05, + "loss": 1.1265, + "step": 4945 + }, + { + "epoch": 1.523018345448252, + "grad_norm": 1.4891122579574585, + "learning_rate": 2.4403959414041583e-05, + "loss": 1.0759, + "step": 4950 + }, + { + "epoch": 1.5245567478173916, + "grad_norm": 1.1123028993606567, + "learning_rate": 2.4363694824861615e-05, + "loss": 1.0696, + "step": 4955 + }, + { + "epoch": 1.5260951501865314, + "grad_norm": 2.104362964630127, + "learning_rate": 2.4323431887247446e-05, + "loss": 1.0459, + "step": 4960 + }, + { + "epoch": 1.5276335525556708, + "grad_norm": 0.9691029787063599, + "learning_rate": 2.4283170705703812e-05, + "loss": 1.2534, + "step": 4965 + }, + { + "epoch": 1.5291719549248106, + "grad_norm": 1.148000717163086, + "learning_rate": 2.424291138473085e-05, + "loss": 1.2542, + "step": 4970 + }, + { + "epoch": 1.5307103572939502, + "grad_norm": 0.9757530093193054, + "learning_rate": 2.4202654028823913e-05, + "loss": 1.087, + "step": 4975 + }, + { + "epoch": 1.5322487596630898, + "grad_norm": 1.5252931118011475, + "learning_rate": 2.4162398742473214e-05, + "loss": 1.2709, + "step": 4980 + }, + { + "epoch": 1.5337871620322296, + "grad_norm": 1.3991789817810059, + "learning_rate": 2.4122145630163616e-05, + "loss": 1.0422, + "step": 4985 + }, + { + "epoch": 1.535325564401369, + "grad_norm": 1.2912970781326294, + "learning_rate": 2.408189479637432e-05, + "loss": 1.1019, + "step": 4990 + }, + { + "epoch": 1.5368639667705088, + "grad_norm": 1.0709604024887085, + "learning_rate": 2.4041646345578637e-05, + "loss": 1.1101, + "step": 4995 + }, + { + "epoch": 1.5384023691396485, + "grad_norm": 1.062154769897461, + "learning_rate": 2.4001400382243675e-05, + "loss": 1.1262, + "step": 5000 + }, + { + "epoch": 1.539940771508788, + "grad_norm": 1.4915518760681152, + "learning_rate": 2.3961157010830095e-05, + "loss": 1.2232, + "step": 5005 + }, + { + "epoch": 1.5414791738779279, + "grad_norm": 1.2963896989822388, + "learning_rate": 2.3920916335791833e-05, + "loss": 1.1251, + "step": 5010 + }, + { + "epoch": 1.5430175762470673, + "grad_norm": 1.115599274635315, + "learning_rate": 2.3880678461575805e-05, + "loss": 1.1748, + "step": 5015 + }, + { + "epoch": 1.544555978616207, + "grad_norm": 1.3244112730026245, + "learning_rate": 2.3840443492621674e-05, + "loss": 1.1706, + "step": 5020 + }, + { + "epoch": 1.5460943809853467, + "grad_norm": 1.903247356414795, + "learning_rate": 2.380021153336158e-05, + "loss": 1.1706, + "step": 5025 + }, + { + "epoch": 1.5476327833544863, + "grad_norm": 2.5160722732543945, + "learning_rate": 2.375998268821982e-05, + "loss": 1.0949, + "step": 5030 + }, + { + "epoch": 1.549171185723626, + "grad_norm": 1.065032958984375, + "learning_rate": 2.371975706161262e-05, + "loss": 1.1651, + "step": 5035 + }, + { + "epoch": 1.5507095880927657, + "grad_norm": 1.1798903942108154, + "learning_rate": 2.3679534757947862e-05, + "loss": 1.0752, + "step": 5040 + }, + { + "epoch": 1.5522479904619053, + "grad_norm": 1.417855143547058, + "learning_rate": 2.3639315881624777e-05, + "loss": 1.204, + "step": 5045 + }, + { + "epoch": 1.553786392831045, + "grad_norm": 3.4984400272369385, + "learning_rate": 2.3599100537033728e-05, + "loss": 1.2201, + "step": 5050 + }, + { + "epoch": 1.5553247952001845, + "grad_norm": 1.7004238367080688, + "learning_rate": 2.35588888285559e-05, + "loss": 1.2381, + "step": 5055 + }, + { + "epoch": 1.5568631975693243, + "grad_norm": 1.1954479217529297, + "learning_rate": 2.3518680860563026e-05, + "loss": 1.291, + "step": 5060 + }, + { + "epoch": 1.558401599938464, + "grad_norm": 1.0503448247909546, + "learning_rate": 2.3478476737417177e-05, + "loss": 1.114, + "step": 5065 + }, + { + "epoch": 1.5599400023076035, + "grad_norm": 1.9155570268630981, + "learning_rate": 2.3438276563470382e-05, + "loss": 1.2755, + "step": 5070 + }, + { + "epoch": 1.5614784046767431, + "grad_norm": 1.2648513317108154, + "learning_rate": 2.3398080443064453e-05, + "loss": 1.1355, + "step": 5075 + }, + { + "epoch": 1.5630168070458828, + "grad_norm": 1.9838799238204956, + "learning_rate": 2.335788848053069e-05, + "loss": 1.1396, + "step": 5080 + }, + { + "epoch": 1.5645552094150226, + "grad_norm": 0.9827559590339661, + "learning_rate": 2.331770078018958e-05, + "loss": 1.1287, + "step": 5085 + }, + { + "epoch": 1.5660936117841622, + "grad_norm": 1.3725968599319458, + "learning_rate": 2.3277517446350566e-05, + "loss": 1.1653, + "step": 5090 + }, + { + "epoch": 1.5676320141533018, + "grad_norm": 1.1363013982772827, + "learning_rate": 2.3237338583311742e-05, + "loss": 1.1403, + "step": 5095 + }, + { + "epoch": 1.5691704165224416, + "grad_norm": 1.514285922050476, + "learning_rate": 2.3197164295359593e-05, + "loss": 1.1604, + "step": 5100 + }, + { + "epoch": 1.570708818891581, + "grad_norm": 1.5780757665634155, + "learning_rate": 2.3156994686768753e-05, + "loss": 1.1634, + "step": 5105 + }, + { + "epoch": 1.5722472212607208, + "grad_norm": 2.1901051998138428, + "learning_rate": 2.3116829861801686e-05, + "loss": 1.0262, + "step": 5110 + }, + { + "epoch": 1.5737856236298604, + "grad_norm": 0.9369887113571167, + "learning_rate": 2.307666992470845e-05, + "loss": 1.1561, + "step": 5115 + }, + { + "epoch": 1.575324025999, + "grad_norm": 1.2366549968719482, + "learning_rate": 2.3036514979726442e-05, + "loss": 1.1065, + "step": 5120 + }, + { + "epoch": 1.5768624283681398, + "grad_norm": 1.257912516593933, + "learning_rate": 2.2996365131080046e-05, + "loss": 1.1071, + "step": 5125 + }, + { + "epoch": 1.5784008307372792, + "grad_norm": 1.1271820068359375, + "learning_rate": 2.295622048298045e-05, + "loss": 1.1964, + "step": 5130 + }, + { + "epoch": 1.579939233106419, + "grad_norm": 1.9496513605117798, + "learning_rate": 2.2916081139625362e-05, + "loss": 1.1061, + "step": 5135 + }, + { + "epoch": 1.5814776354755586, + "grad_norm": 1.8635252714157104, + "learning_rate": 2.287594720519869e-05, + "loss": 1.0035, + "step": 5140 + }, + { + "epoch": 1.5830160378446982, + "grad_norm": 0.9544889330863953, + "learning_rate": 2.2835818783870312e-05, + "loss": 1.2609, + "step": 5145 + }, + { + "epoch": 1.584554440213838, + "grad_norm": 1.032304048538208, + "learning_rate": 2.2795695979795813e-05, + "loss": 1.2609, + "step": 5150 + }, + { + "epoch": 1.5860928425829774, + "grad_norm": 1.1428989171981812, + "learning_rate": 2.275557889711617e-05, + "loss": 1.1503, + "step": 5155 + }, + { + "epoch": 1.5876312449521173, + "grad_norm": 1.0252608060836792, + "learning_rate": 2.271546763995752e-05, + "loss": 1.0164, + "step": 5160 + }, + { + "epoch": 1.5891696473212569, + "grad_norm": 0.9309441447257996, + "learning_rate": 2.2675362312430894e-05, + "loss": 1.118, + "step": 5165 + }, + { + "epoch": 1.5907080496903965, + "grad_norm": 1.2354007959365845, + "learning_rate": 2.2635263018631915e-05, + "loss": 1.1657, + "step": 5170 + }, + { + "epoch": 1.5922464520595363, + "grad_norm": 1.468483805656433, + "learning_rate": 2.2595169862640568e-05, + "loss": 1.1834, + "step": 5175 + }, + { + "epoch": 1.5937848544286757, + "grad_norm": 1.2876496315002441, + "learning_rate": 2.255508294852086e-05, + "loss": 1.152, + "step": 5180 + }, + { + "epoch": 1.5953232567978155, + "grad_norm": 1.2103363275527954, + "learning_rate": 2.2515002380320655e-05, + "loss": 1.3024, + "step": 5185 + }, + { + "epoch": 1.596861659166955, + "grad_norm": 1.977842926979065, + "learning_rate": 2.2474928262071307e-05, + "loss": 1.1682, + "step": 5190 + }, + { + "epoch": 1.5984000615360947, + "grad_norm": 2.3454501628875732, + "learning_rate": 2.243486069778744e-05, + "loss": 1.1026, + "step": 5195 + }, + { + "epoch": 1.5999384639052345, + "grad_norm": 1.8084458112716675, + "learning_rate": 2.239479979146667e-05, + "loss": 1.1451, + "step": 5200 + }, + { + "epoch": 1.601476866274374, + "grad_norm": 1.0935570001602173, + "learning_rate": 2.235474564708933e-05, + "loss": 0.9931, + "step": 5205 + }, + { + "epoch": 1.6030152686435137, + "grad_norm": 1.21275794506073, + "learning_rate": 2.2314698368618198e-05, + "loss": 1.2305, + "step": 5210 + }, + { + "epoch": 1.6045536710126533, + "grad_norm": 1.6089550256729126, + "learning_rate": 2.227465805999823e-05, + "loss": 1.1902, + "step": 5215 + }, + { + "epoch": 1.606092073381793, + "grad_norm": 1.717024803161621, + "learning_rate": 2.2234624825156293e-05, + "loss": 1.204, + "step": 5220 + }, + { + "epoch": 1.6076304757509328, + "grad_norm": 1.2060118913650513, + "learning_rate": 2.219459876800091e-05, + "loss": 1.1242, + "step": 5225 + }, + { + "epoch": 1.6091688781200721, + "grad_norm": 0.8598418831825256, + "learning_rate": 2.2154579992421964e-05, + "loss": 1.0447, + "step": 5230 + }, + { + "epoch": 1.610707280489212, + "grad_norm": 0.9737907648086548, + "learning_rate": 2.2114568602290406e-05, + "loss": 1.1645, + "step": 5235 + }, + { + "epoch": 1.6122456828583516, + "grad_norm": 1.1507982015609741, + "learning_rate": 2.2074564701458065e-05, + "loss": 1.1268, + "step": 5240 + }, + { + "epoch": 1.6137840852274912, + "grad_norm": 1.7262670993804932, + "learning_rate": 2.2034568393757313e-05, + "loss": 1.1256, + "step": 5245 + }, + { + "epoch": 1.615322487596631, + "grad_norm": 1.7809712886810303, + "learning_rate": 2.1994579783000804e-05, + "loss": 1.2046, + "step": 5250 + }, + { + "epoch": 1.6168608899657706, + "grad_norm": 2.4736218452453613, + "learning_rate": 2.1954598972981237e-05, + "loss": 1.1503, + "step": 5255 + }, + { + "epoch": 1.6183992923349102, + "grad_norm": 1.1869739294052124, + "learning_rate": 2.1914626067471032e-05, + "loss": 1.0835, + "step": 5260 + }, + { + "epoch": 1.6199376947040498, + "grad_norm": 1.188988447189331, + "learning_rate": 2.187466117022212e-05, + "loss": 1.0705, + "step": 5265 + }, + { + "epoch": 1.6214760970731894, + "grad_norm": 1.4258966445922852, + "learning_rate": 2.183470438496563e-05, + "loss": 1.1984, + "step": 5270 + }, + { + "epoch": 1.6230144994423292, + "grad_norm": 0.839988112449646, + "learning_rate": 2.1794755815411642e-05, + "loss": 1.1258, + "step": 5275 + }, + { + "epoch": 1.6245529018114688, + "grad_norm": 1.4149938821792603, + "learning_rate": 2.175481556524892e-05, + "loss": 1.1313, + "step": 5280 + }, + { + "epoch": 1.6260913041806084, + "grad_norm": 1.8420780897140503, + "learning_rate": 2.1714883738144627e-05, + "loss": 1.1325, + "step": 5285 + }, + { + "epoch": 1.627629706549748, + "grad_norm": 1.2860522270202637, + "learning_rate": 2.1674960437744044e-05, + "loss": 1.2157, + "step": 5290 + }, + { + "epoch": 1.6291681089188876, + "grad_norm": 1.9698175191879272, + "learning_rate": 2.1635045767670356e-05, + "loss": 1.1402, + "step": 5295 + }, + { + "epoch": 1.6307065112880275, + "grad_norm": 1.716629147529602, + "learning_rate": 2.1595139831524326e-05, + "loss": 1.1031, + "step": 5300 + }, + { + "epoch": 1.632244913657167, + "grad_norm": 1.4469795227050781, + "learning_rate": 2.155524273288405e-05, + "loss": 1.1826, + "step": 5305 + }, + { + "epoch": 1.6337833160263067, + "grad_norm": 1.2318928241729736, + "learning_rate": 2.1515354575304695e-05, + "loss": 1.1256, + "step": 5310 + }, + { + "epoch": 1.6353217183954465, + "grad_norm": 1.3652349710464478, + "learning_rate": 2.1475475462318202e-05, + "loss": 1.2348, + "step": 5315 + }, + { + "epoch": 1.6368601207645859, + "grad_norm": 1.40614914894104, + "learning_rate": 2.1435605497433057e-05, + "loss": 1.1109, + "step": 5320 + }, + { + "epoch": 1.6383985231337257, + "grad_norm": 1.7324262857437134, + "learning_rate": 2.139574478413398e-05, + "loss": 1.0686, + "step": 5325 + }, + { + "epoch": 1.6399369255028653, + "grad_norm": 1.711517333984375, + "learning_rate": 2.135589342588171e-05, + "loss": 0.9758, + "step": 5330 + }, + { + "epoch": 1.641475327872005, + "grad_norm": 1.4670315980911255, + "learning_rate": 2.1316051526112672e-05, + "loss": 1.0558, + "step": 5335 + }, + { + "epoch": 1.6430137302411447, + "grad_norm": 1.625055193901062, + "learning_rate": 2.1276219188238768e-05, + "loss": 1.1067, + "step": 5340 + }, + { + "epoch": 1.644552132610284, + "grad_norm": 2.5813450813293457, + "learning_rate": 2.1236396515647046e-05, + "loss": 1.0579, + "step": 5345 + }, + { + "epoch": 1.646090534979424, + "grad_norm": 1.4112730026245117, + "learning_rate": 2.1196583611699503e-05, + "loss": 1.2685, + "step": 5350 + }, + { + "epoch": 1.6476289373485635, + "grad_norm": 0.8786386251449585, + "learning_rate": 2.1156780579732764e-05, + "loss": 1.1172, + "step": 5355 + }, + { + "epoch": 1.6491673397177031, + "grad_norm": 0.9437426328659058, + "learning_rate": 2.111698752305783e-05, + "loss": 1.0846, + "step": 5360 + }, + { + "epoch": 1.650705742086843, + "grad_norm": 1.353835940361023, + "learning_rate": 2.1077204544959825e-05, + "loss": 1.1903, + "step": 5365 + }, + { + "epoch": 1.6522441444559823, + "grad_norm": 1.4217884540557861, + "learning_rate": 2.1037431748697688e-05, + "loss": 1.0995, + "step": 5370 + }, + { + "epoch": 1.6537825468251222, + "grad_norm": 1.296500325202942, + "learning_rate": 2.099766923750395e-05, + "loss": 1.1283, + "step": 5375 + }, + { + "epoch": 1.6553209491942618, + "grad_norm": 1.42235267162323, + "learning_rate": 2.095791711458444e-05, + "loss": 1.1127, + "step": 5380 + }, + { + "epoch": 1.6568593515634014, + "grad_norm": 1.1460708379745483, + "learning_rate": 2.0918175483118036e-05, + "loss": 1.0967, + "step": 5385 + }, + { + "epoch": 1.6583977539325412, + "grad_norm": 1.1948250532150269, + "learning_rate": 2.0878444446256364e-05, + "loss": 1.2248, + "step": 5390 + }, + { + "epoch": 1.6599361563016806, + "grad_norm": 0.8646104335784912, + "learning_rate": 2.083872410712357e-05, + "loss": 1.1827, + "step": 5395 + }, + { + "epoch": 1.6614745586708204, + "grad_norm": 1.0145443677902222, + "learning_rate": 2.079901456881601e-05, + "loss": 1.0875, + "step": 5400 + }, + { + "epoch": 1.66301296103996, + "grad_norm": 1.4740755558013916, + "learning_rate": 2.075931593440203e-05, + "loss": 1.1525, + "step": 5405 + }, + { + "epoch": 1.6645513634090996, + "grad_norm": 1.4376249313354492, + "learning_rate": 2.0719628306921664e-05, + "loss": 1.1863, + "step": 5410 + }, + { + "epoch": 1.6660897657782394, + "grad_norm": 1.4571655988693237, + "learning_rate": 2.067995178938638e-05, + "loss": 0.8931, + "step": 5415 + }, + { + "epoch": 1.6676281681473788, + "grad_norm": 1.8079279661178589, + "learning_rate": 2.0640286484778804e-05, + "loss": 0.9931, + "step": 5420 + }, + { + "epoch": 1.6691665705165186, + "grad_norm": 1.1777446269989014, + "learning_rate": 2.0600632496052457e-05, + "loss": 1.2278, + "step": 5425 + }, + { + "epoch": 1.6707049728856582, + "grad_norm": 1.0672292709350586, + "learning_rate": 2.05609899261315e-05, + "loss": 1.2236, + "step": 5430 + }, + { + "epoch": 1.6722433752547978, + "grad_norm": 1.5936694145202637, + "learning_rate": 2.0521358877910444e-05, + "loss": 1.2029, + "step": 5435 + }, + { + "epoch": 1.6737817776239376, + "grad_norm": 1.6731094121932983, + "learning_rate": 2.0481739454253904e-05, + "loss": 1.1809, + "step": 5440 + }, + { + "epoch": 1.675320179993077, + "grad_norm": 2.1021690368652344, + "learning_rate": 2.044213175799632e-05, + "loss": 1.1373, + "step": 5445 + }, + { + "epoch": 1.6768585823622169, + "grad_norm": 2.2051169872283936, + "learning_rate": 2.0402535891941695e-05, + "loss": 1.1215, + "step": 5450 + }, + { + "epoch": 1.6783969847313565, + "grad_norm": 2.0235435962677, + "learning_rate": 2.0362951958863306e-05, + "loss": 1.0841, + "step": 5455 + }, + { + "epoch": 1.679935387100496, + "grad_norm": 1.1775131225585938, + "learning_rate": 2.0323380061503494e-05, + "loss": 1.0182, + "step": 5460 + }, + { + "epoch": 1.6814737894696359, + "grad_norm": 1.5589635372161865, + "learning_rate": 2.0283820302573327e-05, + "loss": 1.1266, + "step": 5465 + }, + { + "epoch": 1.6830121918387755, + "grad_norm": 1.2760752439498901, + "learning_rate": 2.024427278475239e-05, + "loss": 1.0757, + "step": 5470 + }, + { + "epoch": 1.684550594207915, + "grad_norm": 1.4246011972427368, + "learning_rate": 2.0204737610688482e-05, + "loss": 1.0888, + "step": 5475 + }, + { + "epoch": 1.6860889965770547, + "grad_norm": 1.7687841653823853, + "learning_rate": 2.0165214882997363e-05, + "loss": 1.1195, + "step": 5480 + }, + { + "epoch": 1.6876273989461943, + "grad_norm": 1.873183012008667, + "learning_rate": 2.012570470426249e-05, + "loss": 1.2103, + "step": 5485 + }, + { + "epoch": 1.6891658013153341, + "grad_norm": 1.3162018060684204, + "learning_rate": 2.0086207177034765e-05, + "loss": 1.1606, + "step": 5490 + }, + { + "epoch": 1.6907042036844737, + "grad_norm": 1.9198535680770874, + "learning_rate": 2.0046722403832227e-05, + "loss": 1.1806, + "step": 5495 + }, + { + "epoch": 1.6922426060536133, + "grad_norm": 1.908752202987671, + "learning_rate": 2.000725048713983e-05, + "loss": 1.2071, + "step": 5500 + }, + { + "epoch": 1.693781008422753, + "grad_norm": 1.1114752292633057, + "learning_rate": 1.996779152940914e-05, + "loss": 1.1524, + "step": 5505 + }, + { + "epoch": 1.6953194107918925, + "grad_norm": 1.055211067199707, + "learning_rate": 1.99283456330581e-05, + "loss": 1.1581, + "step": 5510 + }, + { + "epoch": 1.6968578131610323, + "grad_norm": 3.748779773712158, + "learning_rate": 1.988891290047075e-05, + "loss": 1.1191, + "step": 5515 + }, + { + "epoch": 1.698396215530172, + "grad_norm": 1.2784082889556885, + "learning_rate": 1.9849493433996963e-05, + "loss": 1.0007, + "step": 5520 + }, + { + "epoch": 1.6999346178993116, + "grad_norm": 1.2205626964569092, + "learning_rate": 1.9810087335952172e-05, + "loss": 1.1451, + "step": 5525 + }, + { + "epoch": 1.7014730202684514, + "grad_norm": 0.999777615070343, + "learning_rate": 1.977069470861714e-05, + "loss": 1.1493, + "step": 5530 + }, + { + "epoch": 1.7030114226375908, + "grad_norm": 1.3446388244628906, + "learning_rate": 1.9731315654237613e-05, + "loss": 1.1549, + "step": 5535 + }, + { + "epoch": 1.7045498250067306, + "grad_norm": 1.1934943199157715, + "learning_rate": 1.9691950275024144e-05, + "loss": 1.2842, + "step": 5540 + }, + { + "epoch": 1.7060882273758702, + "grad_norm": 1.0282137393951416, + "learning_rate": 1.9652598673151798e-05, + "loss": 0.9822, + "step": 5545 + }, + { + "epoch": 1.7076266297450098, + "grad_norm": 1.6977639198303223, + "learning_rate": 1.961326095075986e-05, + "loss": 1.1677, + "step": 5550 + }, + { + "epoch": 1.7091650321141496, + "grad_norm": 1.1296579837799072, + "learning_rate": 1.9573937209951604e-05, + "loss": 1.1473, + "step": 5555 + }, + { + "epoch": 1.710703434483289, + "grad_norm": 1.5597577095031738, + "learning_rate": 1.9534627552793998e-05, + "loss": 1.1344, + "step": 5560 + }, + { + "epoch": 1.7122418368524288, + "grad_norm": 1.7576704025268555, + "learning_rate": 1.9495332081317464e-05, + "loss": 1.1523, + "step": 5565 + }, + { + "epoch": 1.7137802392215684, + "grad_norm": 0.7898119688034058, + "learning_rate": 1.945605089751561e-05, + "loss": 1.2694, + "step": 5570 + }, + { + "epoch": 1.715318641590708, + "grad_norm": 1.6534613370895386, + "learning_rate": 1.9416784103344958e-05, + "loss": 1.2023, + "step": 5575 + }, + { + "epoch": 1.7168570439598478, + "grad_norm": 1.4977833032608032, + "learning_rate": 1.937753180072466e-05, + "loss": 1.134, + "step": 5580 + }, + { + "epoch": 1.7183954463289872, + "grad_norm": 1.0111984014511108, + "learning_rate": 1.93382940915363e-05, + "loss": 1.1758, + "step": 5585 + }, + { + "epoch": 1.719933848698127, + "grad_norm": 1.2132163047790527, + "learning_rate": 1.9299071077623536e-05, + "loss": 1.0628, + "step": 5590 + }, + { + "epoch": 1.7214722510672666, + "grad_norm": 1.3343091011047363, + "learning_rate": 1.9259862860791894e-05, + "loss": 1.0998, + "step": 5595 + }, + { + "epoch": 1.7230106534364062, + "grad_norm": 1.314704179763794, + "learning_rate": 1.922066954280852e-05, + "loss": 1.1529, + "step": 5600 + }, + { + "epoch": 1.724549055805546, + "grad_norm": 1.7054861783981323, + "learning_rate": 1.918149122540187e-05, + "loss": 1.1795, + "step": 5605 + }, + { + "epoch": 1.7260874581746855, + "grad_norm": 1.477265477180481, + "learning_rate": 1.9142328010261463e-05, + "loss": 1.1547, + "step": 5610 + }, + { + "epoch": 1.7276258605438253, + "grad_norm": 3.371182918548584, + "learning_rate": 1.910317999903762e-05, + "loss": 1.13, + "step": 5615 + }, + { + "epoch": 1.7291642629129649, + "grad_norm": 1.1603204011917114, + "learning_rate": 1.9064047293341205e-05, + "loss": 1.2422, + "step": 5620 + }, + { + "epoch": 1.7307026652821045, + "grad_norm": 1.587916612625122, + "learning_rate": 1.9024929994743354e-05, + "loss": 1.2718, + "step": 5625 + }, + { + "epoch": 1.7322410676512443, + "grad_norm": 1.9051862955093384, + "learning_rate": 1.8985828204775206e-05, + "loss": 1.0266, + "step": 5630 + }, + { + "epoch": 1.7337794700203837, + "grad_norm": 0.9715479016304016, + "learning_rate": 1.8946742024927662e-05, + "loss": 1.0988, + "step": 5635 + }, + { + "epoch": 1.7353178723895235, + "grad_norm": 1.471206545829773, + "learning_rate": 1.8907671556651102e-05, + "loss": 1.2839, + "step": 5640 + }, + { + "epoch": 1.736856274758663, + "grad_norm": 1.127846598625183, + "learning_rate": 1.8868616901355096e-05, + "loss": 1.084, + "step": 5645 + }, + { + "epoch": 1.7383946771278027, + "grad_norm": 1.6066924333572388, + "learning_rate": 1.8829578160408216e-05, + "loss": 1.1079, + "step": 5650 + }, + { + "epoch": 1.7399330794969425, + "grad_norm": 1.393292784690857, + "learning_rate": 1.8790555435137697e-05, + "loss": 1.0559, + "step": 5655 + }, + { + "epoch": 1.741471481866082, + "grad_norm": 1.9487007856369019, + "learning_rate": 1.875154882682922e-05, + "loss": 1.1889, + "step": 5660 + }, + { + "epoch": 1.7430098842352217, + "grad_norm": 0.9660601019859314, + "learning_rate": 1.8712558436726623e-05, + "loss": 1.1461, + "step": 5665 + }, + { + "epoch": 1.7445482866043613, + "grad_norm": 0.9336276054382324, + "learning_rate": 1.8673584366031647e-05, + "loss": 1.1877, + "step": 5670 + }, + { + "epoch": 1.746086688973501, + "grad_norm": 1.3800424337387085, + "learning_rate": 1.8634626715903693e-05, + "loss": 1.2214, + "step": 5675 + }, + { + "epoch": 1.7476250913426408, + "grad_norm": 0.946338415145874, + "learning_rate": 1.8595685587459522e-05, + "loss": 1.2095, + "step": 5680 + }, + { + "epoch": 1.7491634937117804, + "grad_norm": 2.1087701320648193, + "learning_rate": 1.8556761081773013e-05, + "loss": 1.0836, + "step": 5685 + }, + { + "epoch": 1.75070189608092, + "grad_norm": 0.8675004243850708, + "learning_rate": 1.851785329987492e-05, + "loss": 1.1866, + "step": 5690 + }, + { + "epoch": 1.7522402984500596, + "grad_norm": 1.0243099927902222, + "learning_rate": 1.8478962342752583e-05, + "loss": 1.2131, + "step": 5695 + }, + { + "epoch": 1.7537787008191992, + "grad_norm": 1.2281181812286377, + "learning_rate": 1.8440088311349634e-05, + "loss": 1.2623, + "step": 5700 + }, + { + "epoch": 1.755317103188339, + "grad_norm": 1.4797272682189941, + "learning_rate": 1.840123130656583e-05, + "loss": 1.2044, + "step": 5705 + }, + { + "epoch": 1.7568555055574786, + "grad_norm": 1.8646970987319946, + "learning_rate": 1.8362391429256698e-05, + "loss": 1.152, + "step": 5710 + }, + { + "epoch": 1.7583939079266182, + "grad_norm": 1.331153154373169, + "learning_rate": 1.8323568780233325e-05, + "loss": 1.2001, + "step": 5715 + }, + { + "epoch": 1.7599323102957578, + "grad_norm": 1.3193144798278809, + "learning_rate": 1.8284763460262085e-05, + "loss": 1.2019, + "step": 5720 + }, + { + "epoch": 1.7614707126648974, + "grad_norm": 2.1348347663879395, + "learning_rate": 1.824597557006434e-05, + "loss": 1.1155, + "step": 5725 + }, + { + "epoch": 1.7630091150340372, + "grad_norm": 0.8556899428367615, + "learning_rate": 1.820720521031626e-05, + "loss": 1.208, + "step": 5730 + }, + { + "epoch": 1.7645475174031768, + "grad_norm": 1.1282209157943726, + "learning_rate": 1.8168452481648476e-05, + "loss": 1.2334, + "step": 5735 + }, + { + "epoch": 1.7660859197723164, + "grad_norm": 1.082572102546692, + "learning_rate": 1.8129717484645876e-05, + "loss": 1.0867, + "step": 5740 + }, + { + "epoch": 1.7676243221414563, + "grad_norm": 1.2311683893203735, + "learning_rate": 1.809100031984734e-05, + "loss": 1.2453, + "step": 5745 + }, + { + "epoch": 1.7691627245105956, + "grad_norm": 1.6507987976074219, + "learning_rate": 1.805230108774541e-05, + "loss": 1.1704, + "step": 5750 + }, + { + "epoch": 1.7707011268797355, + "grad_norm": 1.152384877204895, + "learning_rate": 1.8013619888786127e-05, + "loss": 1.2105, + "step": 5755 + }, + { + "epoch": 1.772239529248875, + "grad_norm": 1.057258129119873, + "learning_rate": 1.7974956823368727e-05, + "loss": 1.2076, + "step": 5760 + }, + { + "epoch": 1.7737779316180147, + "grad_norm": 0.8284866809844971, + "learning_rate": 1.7936311991845355e-05, + "loss": 1.1031, + "step": 5765 + }, + { + "epoch": 1.7753163339871545, + "grad_norm": 0.9494788646697998, + "learning_rate": 1.789768549452085e-05, + "loss": 1.1328, + "step": 5770 + }, + { + "epoch": 1.7768547363562939, + "grad_norm": 0.9356427192687988, + "learning_rate": 1.785907743165245e-05, + "loss": 1.1613, + "step": 5775 + }, + { + "epoch": 1.7783931387254337, + "grad_norm": 1.4174987077713013, + "learning_rate": 1.7820487903449544e-05, + "loss": 1.1835, + "step": 5780 + }, + { + "epoch": 1.7799315410945733, + "grad_norm": 0.9856932163238525, + "learning_rate": 1.778191701007343e-05, + "loss": 1.1883, + "step": 5785 + }, + { + "epoch": 1.781469943463713, + "grad_norm": 1.5140010118484497, + "learning_rate": 1.7743364851637017e-05, + "loss": 1.0919, + "step": 5790 + }, + { + "epoch": 1.7830083458328527, + "grad_norm": 1.0665122270584106, + "learning_rate": 1.7704831528204608e-05, + "loss": 1.2349, + "step": 5795 + }, + { + "epoch": 1.784546748201992, + "grad_norm": 1.7453125715255737, + "learning_rate": 1.7666317139791618e-05, + "loss": 1.0932, + "step": 5800 + }, + { + "epoch": 1.786085150571132, + "grad_norm": 1.5194388628005981, + "learning_rate": 1.7627821786364265e-05, + "loss": 1.1284, + "step": 5805 + }, + { + "epoch": 1.7876235529402715, + "grad_norm": 1.0937379598617554, + "learning_rate": 1.7589345567839433e-05, + "loss": 1.1492, + "step": 5810 + }, + { + "epoch": 1.7891619553094111, + "grad_norm": 1.1306276321411133, + "learning_rate": 1.75508885840843e-05, + "loss": 1.1348, + "step": 5815 + }, + { + "epoch": 1.790700357678551, + "grad_norm": 1.172468900680542, + "learning_rate": 1.7512450934916128e-05, + "loss": 1.1805, + "step": 5820 + }, + { + "epoch": 1.7922387600476903, + "grad_norm": 1.2849513292312622, + "learning_rate": 1.747403272010199e-05, + "loss": 1.2076, + "step": 5825 + }, + { + "epoch": 1.7937771624168302, + "grad_norm": 1.0420297384262085, + "learning_rate": 1.7435634039358527e-05, + "loss": 1.1641, + "step": 5830 + }, + { + "epoch": 1.7953155647859698, + "grad_norm": 0.999232828617096, + "learning_rate": 1.7397254992351662e-05, + "loss": 1.0551, + "step": 5835 + }, + { + "epoch": 1.7968539671551094, + "grad_norm": 1.499186635017395, + "learning_rate": 1.7358895678696368e-05, + "loss": 1.2383, + "step": 5840 + }, + { + "epoch": 1.7983923695242492, + "grad_norm": 1.5105886459350586, + "learning_rate": 1.73205561979564e-05, + "loss": 1.1292, + "step": 5845 + }, + { + "epoch": 1.7999307718933886, + "grad_norm": 1.0507991313934326, + "learning_rate": 1.7282236649644035e-05, + "loss": 1.1165, + "step": 5850 + }, + { + "epoch": 1.8014691742625284, + "grad_norm": 1.8862839937210083, + "learning_rate": 1.7243937133219818e-05, + "loss": 1.1018, + "step": 5855 + }, + { + "epoch": 1.803007576631668, + "grad_norm": 1.0384774208068848, + "learning_rate": 1.7205657748092275e-05, + "loss": 1.2132, + "step": 5860 + }, + { + "epoch": 1.8045459790008076, + "grad_norm": 1.6291218996047974, + "learning_rate": 1.716739859361771e-05, + "loss": 1.1987, + "step": 5865 + }, + { + "epoch": 1.8060843813699474, + "grad_norm": 0.9344973564147949, + "learning_rate": 1.712915976909992e-05, + "loss": 1.1942, + "step": 5870 + }, + { + "epoch": 1.8076227837390868, + "grad_norm": 1.1848701238632202, + "learning_rate": 1.7090941373789898e-05, + "loss": 1.2246, + "step": 5875 + }, + { + "epoch": 1.8091611861082266, + "grad_norm": 2.381784439086914, + "learning_rate": 1.7052743506885652e-05, + "loss": 1.1942, + "step": 5880 + }, + { + "epoch": 1.8106995884773662, + "grad_norm": 2.1836729049682617, + "learning_rate": 1.701456626753189e-05, + "loss": 1.1698, + "step": 5885 + }, + { + "epoch": 1.8122379908465058, + "grad_norm": 1.0437754392623901, + "learning_rate": 1.6976409754819767e-05, + "loss": 1.195, + "step": 5890 + }, + { + "epoch": 1.8137763932156457, + "grad_norm": 1.6375466585159302, + "learning_rate": 1.6938274067786663e-05, + "loss": 1.2909, + "step": 5895 + }, + { + "epoch": 1.815314795584785, + "grad_norm": 1.204934000968933, + "learning_rate": 1.6900159305415892e-05, + "loss": 1.1238, + "step": 5900 + }, + { + "epoch": 1.8168531979539249, + "grad_norm": 1.3872859477996826, + "learning_rate": 1.6862065566636466e-05, + "loss": 1.2359, + "step": 5905 + }, + { + "epoch": 1.8183916003230645, + "grad_norm": 1.733851432800293, + "learning_rate": 1.682399295032283e-05, + "loss": 1.1205, + "step": 5910 + }, + { + "epoch": 1.819930002692204, + "grad_norm": 1.224089503288269, + "learning_rate": 1.6785941555294573e-05, + "loss": 1.1741, + "step": 5915 + }, + { + "epoch": 1.8214684050613439, + "grad_norm": 1.3531755208969116, + "learning_rate": 1.675551578496907e-05, + "loss": 1.1854, + "step": 5920 + }, + { + "epoch": 1.8230068074304835, + "grad_norm": 1.2245914936065674, + "learning_rate": 1.6717502837103975e-05, + "loss": 1.1473, + "step": 5925 + }, + { + "epoch": 1.824545209799623, + "grad_norm": 1.7382968664169312, + "learning_rate": 1.6679511386925337e-05, + "loss": 1.1192, + "step": 5930 + }, + { + "epoch": 1.8260836121687627, + "grad_norm": 2.019582748413086, + "learning_rate": 1.6641541533042098e-05, + "loss": 1.1244, + "step": 5935 + }, + { + "epoch": 1.8276220145379023, + "grad_norm": 1.7222234010696411, + "learning_rate": 1.6603593374007153e-05, + "loss": 1.0861, + "step": 5940 + }, + { + "epoch": 1.8291604169070421, + "grad_norm": 1.1440834999084473, + "learning_rate": 1.656566700831708e-05, + "loss": 1.1462, + "step": 5945 + }, + { + "epoch": 1.8306988192761817, + "grad_norm": 2.6508567333221436, + "learning_rate": 1.6527762534411888e-05, + "loss": 1.0422, + "step": 5950 + }, + { + "epoch": 1.8322372216453213, + "grad_norm": 1.4697010517120361, + "learning_rate": 1.6489880050674767e-05, + "loss": 1.184, + "step": 5955 + }, + { + "epoch": 1.8337756240144611, + "grad_norm": 1.2767016887664795, + "learning_rate": 1.6452019655431828e-05, + "loss": 1.1303, + "step": 5960 + }, + { + "epoch": 1.8353140263836005, + "grad_norm": 1.345672607421875, + "learning_rate": 1.641418144695185e-05, + "loss": 1.0967, + "step": 5965 + }, + { + "epoch": 1.8368524287527404, + "grad_norm": 0.7754538059234619, + "learning_rate": 1.637636552344604e-05, + "loss": 1.1405, + "step": 5970 + }, + { + "epoch": 1.83839083112188, + "grad_norm": 1.5453096628189087, + "learning_rate": 1.6338571983067754e-05, + "loss": 1.0661, + "step": 5975 + }, + { + "epoch": 1.8399292334910196, + "grad_norm": 1.0012327432632446, + "learning_rate": 1.6300800923912224e-05, + "loss": 1.1432, + "step": 5980 + }, + { + "epoch": 1.8414676358601594, + "grad_norm": 0.9194159507751465, + "learning_rate": 1.6263052444016374e-05, + "loss": 1.1303, + "step": 5985 + }, + { + "epoch": 1.8430060382292988, + "grad_norm": 0.9644936323165894, + "learning_rate": 1.62253266413585e-05, + "loss": 1.177, + "step": 5990 + }, + { + "epoch": 1.8445444405984386, + "grad_norm": 1.5765095949172974, + "learning_rate": 1.6187623613858038e-05, + "loss": 1.1667, + "step": 5995 + }, + { + "epoch": 1.8460828429675782, + "grad_norm": 0.9766362309455872, + "learning_rate": 1.6149943459375312e-05, + "loss": 1.0883, + "step": 6000 + }, + { + "epoch": 1.8476212453367178, + "grad_norm": 1.214890480041504, + "learning_rate": 1.6112286275711298e-05, + "loss": 1.1184, + "step": 6005 + }, + { + "epoch": 1.8491596477058576, + "grad_norm": 1.128957986831665, + "learning_rate": 1.6074652160607302e-05, + "loss": 1.2097, + "step": 6010 + }, + { + "epoch": 1.850698050074997, + "grad_norm": 0.8823169469833374, + "learning_rate": 1.603704121174479e-05, + "loss": 1.1738, + "step": 6015 + }, + { + "epoch": 1.8522364524441368, + "grad_norm": 1.1140327453613281, + "learning_rate": 1.5999453526745104e-05, + "loss": 1.116, + "step": 6020 + }, + { + "epoch": 1.8537748548132764, + "grad_norm": 1.464568853378296, + "learning_rate": 1.5961889203169184e-05, + "loss": 1.1432, + "step": 6025 + }, + { + "epoch": 1.855313257182416, + "grad_norm": 1.134320616722107, + "learning_rate": 1.592434833851734e-05, + "loss": 1.1429, + "step": 6030 + }, + { + "epoch": 1.8568516595515558, + "grad_norm": 1.3063602447509766, + "learning_rate": 1.5886831030229e-05, + "loss": 1.1284, + "step": 6035 + }, + { + "epoch": 1.8583900619206952, + "grad_norm": 2.385805130004883, + "learning_rate": 1.5849337375682435e-05, + "loss": 1.1124, + "step": 6040 + }, + { + "epoch": 1.859928464289835, + "grad_norm": 0.9437244534492493, + "learning_rate": 1.5811867472194535e-05, + "loss": 1.173, + "step": 6045 + }, + { + "epoch": 1.8614668666589747, + "grad_norm": 1.2181042432785034, + "learning_rate": 1.577442141702054e-05, + "loss": 1.3157, + "step": 6050 + }, + { + "epoch": 1.8630052690281143, + "grad_norm": 1.382906198501587, + "learning_rate": 1.5736999307353785e-05, + "loss": 1.0986, + "step": 6055 + }, + { + "epoch": 1.864543671397254, + "grad_norm": 1.4896674156188965, + "learning_rate": 1.5699601240325474e-05, + "loss": 1.2179, + "step": 6060 + }, + { + "epoch": 1.8660820737663935, + "grad_norm": 1.4793058633804321, + "learning_rate": 1.5662227313004364e-05, + "loss": 1.0728, + "step": 6065 + }, + { + "epoch": 1.8676204761355333, + "grad_norm": 1.6949820518493652, + "learning_rate": 1.5624877622396588e-05, + "loss": 1.1914, + "step": 6070 + }, + { + "epoch": 1.8691588785046729, + "grad_norm": 1.629065990447998, + "learning_rate": 1.5587552265445375e-05, + "loss": 1.2429, + "step": 6075 + }, + { + "epoch": 1.8706972808738125, + "grad_norm": 1.2364530563354492, + "learning_rate": 1.5550251339030783e-05, + "loss": 1.0742, + "step": 6080 + }, + { + "epoch": 1.8722356832429523, + "grad_norm": 0.8806670904159546, + "learning_rate": 1.5512974939969464e-05, + "loss": 1.1999, + "step": 6085 + }, + { + "epoch": 1.8737740856120917, + "grad_norm": 1.9689141511917114, + "learning_rate": 1.5475723165014393e-05, + "loss": 1.1117, + "step": 6090 + }, + { + "epoch": 1.8753124879812315, + "grad_norm": 0.9261854887008667, + "learning_rate": 1.543849611085465e-05, + "loss": 1.0828, + "step": 6095 + }, + { + "epoch": 1.8768508903503711, + "grad_norm": 1.5326859951019287, + "learning_rate": 1.5401293874115147e-05, + "loss": 1.0189, + "step": 6100 + }, + { + "epoch": 1.8783892927195107, + "grad_norm": 1.155781865119934, + "learning_rate": 1.5364116551356376e-05, + "loss": 1.0945, + "step": 6105 + }, + { + "epoch": 1.8799276950886505, + "grad_norm": 1.5384670495986938, + "learning_rate": 1.532696423907416e-05, + "loss": 1.205, + "step": 6110 + }, + { + "epoch": 1.88146609745779, + "grad_norm": 1.558389663696289, + "learning_rate": 1.528983703369943e-05, + "loss": 1.1512, + "step": 6115 + }, + { + "epoch": 1.8830044998269297, + "grad_norm": 1.9130889177322388, + "learning_rate": 1.5252735031597915e-05, + "loss": 1.1126, + "step": 6120 + }, + { + "epoch": 1.8845429021960693, + "grad_norm": 1.6255749464035034, + "learning_rate": 1.521565832906994e-05, + "loss": 1.1858, + "step": 6125 + }, + { + "epoch": 1.886081304565209, + "grad_norm": 1.8116388320922852, + "learning_rate": 1.5178607022350186e-05, + "loss": 1.0117, + "step": 6130 + }, + { + "epoch": 1.8876197069343488, + "grad_norm": 1.0656421184539795, + "learning_rate": 1.5141581207607391e-05, + "loss": 1.1256, + "step": 6135 + }, + { + "epoch": 1.8891581093034884, + "grad_norm": 1.6898198127746582, + "learning_rate": 1.5104580980944141e-05, + "loss": 1.0737, + "step": 6140 + }, + { + "epoch": 1.890696511672628, + "grad_norm": 1.3169077634811401, + "learning_rate": 1.5067606438396595e-05, + "loss": 1.1341, + "step": 6145 + }, + { + "epoch": 1.8922349140417676, + "grad_norm": 0.8919572234153748, + "learning_rate": 1.5030657675934256e-05, + "loss": 1.0732, + "step": 6150 + }, + { + "epoch": 1.8937733164109072, + "grad_norm": 2.686988353729248, + "learning_rate": 1.4993734789459718e-05, + "loss": 1.229, + "step": 6155 + }, + { + "epoch": 1.895311718780047, + "grad_norm": 1.9947311878204346, + "learning_rate": 1.4956837874808391e-05, + "loss": 1.116, + "step": 6160 + }, + { + "epoch": 1.8968501211491866, + "grad_norm": 1.5105503797531128, + "learning_rate": 1.4919967027748306e-05, + "loss": 1.1457, + "step": 6165 + }, + { + "epoch": 1.8983885235183262, + "grad_norm": 1.2878894805908203, + "learning_rate": 1.4883122343979822e-05, + "loss": 1.1197, + "step": 6170 + }, + { + "epoch": 1.8999269258874658, + "grad_norm": 1.2365055084228516, + "learning_rate": 1.4846303919135355e-05, + "loss": 1.1429, + "step": 6175 + }, + { + "epoch": 1.9014653282566054, + "grad_norm": 2.5517539978027344, + "learning_rate": 1.4809511848779217e-05, + "loss": 1.1013, + "step": 6180 + }, + { + "epoch": 1.9030037306257452, + "grad_norm": 1.3758244514465332, + "learning_rate": 1.4772746228407289e-05, + "loss": 1.0995, + "step": 6185 + }, + { + "epoch": 1.9045421329948848, + "grad_norm": 1.0151314735412598, + "learning_rate": 1.4736007153446801e-05, + "loss": 1.0569, + "step": 6190 + }, + { + "epoch": 1.9060805353640244, + "grad_norm": 0.9583563208580017, + "learning_rate": 1.4699294719256091e-05, + "loss": 1.1816, + "step": 6195 + }, + { + "epoch": 1.9076189377331643, + "grad_norm": 1.0372930765151978, + "learning_rate": 1.466260902112433e-05, + "loss": 1.1818, + "step": 6200 + }, + { + "epoch": 1.9091573401023036, + "grad_norm": 1.4027732610702515, + "learning_rate": 1.4625950154271317e-05, + "loss": 1.3359, + "step": 6205 + }, + { + "epoch": 1.9106957424714435, + "grad_norm": 1.8428670167922974, + "learning_rate": 1.4589318213847197e-05, + "loss": 1.1615, + "step": 6210 + }, + { + "epoch": 1.912234144840583, + "grad_norm": 2.2269155979156494, + "learning_rate": 1.4552713294932226e-05, + "loss": 1.1065, + "step": 6215 + }, + { + "epoch": 1.9137725472097227, + "grad_norm": 1.9540749788284302, + "learning_rate": 1.4516135492536539e-05, + "loss": 1.0822, + "step": 6220 + }, + { + "epoch": 1.9153109495788625, + "grad_norm": 1.690127968788147, + "learning_rate": 1.447958490159987e-05, + "loss": 1.1871, + "step": 6225 + }, + { + "epoch": 1.9168493519480019, + "grad_norm": 1.6063660383224487, + "learning_rate": 1.444306161699131e-05, + "loss": 1.2001, + "step": 6230 + }, + { + "epoch": 1.9183877543171417, + "grad_norm": 1.3402923345565796, + "learning_rate": 1.4406565733509126e-05, + "loss": 1.0825, + "step": 6235 + }, + { + "epoch": 1.9199261566862813, + "grad_norm": 1.2019435167312622, + "learning_rate": 1.4370097345880407e-05, + "loss": 1.2244, + "step": 6240 + }, + { + "epoch": 1.921464559055421, + "grad_norm": 0.9290767312049866, + "learning_rate": 1.433365654876091e-05, + "loss": 1.2298, + "step": 6245 + }, + { + "epoch": 1.9230029614245607, + "grad_norm": 1.7722569704055786, + "learning_rate": 1.4297243436734797e-05, + "loss": 1.2057, + "step": 6250 + }, + { + "epoch": 1.9245413637937, + "grad_norm": 1.9733870029449463, + "learning_rate": 1.4260858104314297e-05, + "loss": 1.1947, + "step": 6255 + }, + { + "epoch": 1.92607976616284, + "grad_norm": 1.5969133377075195, + "learning_rate": 1.422450064593961e-05, + "loss": 1.1204, + "step": 6260 + }, + { + "epoch": 1.9276181685319795, + "grad_norm": 1.4854793548583984, + "learning_rate": 1.4188171155978566e-05, + "loss": 1.2412, + "step": 6265 + }, + { + "epoch": 1.9291565709011191, + "grad_norm": 1.1663974523544312, + "learning_rate": 1.4151869728726378e-05, + "loss": 1.2189, + "step": 6270 + }, + { + "epoch": 1.930694973270259, + "grad_norm": 1.081315279006958, + "learning_rate": 1.4115596458405459e-05, + "loss": 1.2044, + "step": 6275 + }, + { + "epoch": 1.9322333756393983, + "grad_norm": 1.7983838319778442, + "learning_rate": 1.4079351439165106e-05, + "loss": 1.187, + "step": 6280 + }, + { + "epoch": 1.9337717780085382, + "grad_norm": 1.329074740409851, + "learning_rate": 1.4043134765081297e-05, + "loss": 1.1532, + "step": 6285 + }, + { + "epoch": 1.9353101803776778, + "grad_norm": 1.1615062952041626, + "learning_rate": 1.4006946530156462e-05, + "loss": 1.1056, + "step": 6290 + }, + { + "epoch": 1.9368485827468174, + "grad_norm": 1.0058648586273193, + "learning_rate": 1.397078682831917e-05, + "loss": 1.1568, + "step": 6295 + }, + { + "epoch": 1.9383869851159572, + "grad_norm": 1.2599167823791504, + "learning_rate": 1.3934655753423976e-05, + "loss": 1.1126, + "step": 6300 + }, + { + "epoch": 1.9399253874850966, + "grad_norm": 1.3756744861602783, + "learning_rate": 1.389855339925113e-05, + "loss": 1.0468, + "step": 6305 + }, + { + "epoch": 1.9414637898542364, + "grad_norm": 1.7275495529174805, + "learning_rate": 1.386247985950628e-05, + "loss": 1.1714, + "step": 6310 + }, + { + "epoch": 1.943002192223376, + "grad_norm": 1.5819746255874634, + "learning_rate": 1.3826435227820344e-05, + "loss": 1.1858, + "step": 6315 + }, + { + "epoch": 1.9445405945925156, + "grad_norm": 1.3510098457336426, + "learning_rate": 1.3790419597749199e-05, + "loss": 1.1779, + "step": 6320 + }, + { + "epoch": 1.9460789969616554, + "grad_norm": 1.3834389448165894, + "learning_rate": 1.3754433062773409e-05, + "loss": 1.1468, + "step": 6325 + }, + { + "epoch": 1.9476173993307948, + "grad_norm": 1.2175872325897217, + "learning_rate": 1.3718475716298073e-05, + "loss": 1.1402, + "step": 6330 + }, + { + "epoch": 1.9491558016999346, + "grad_norm": 0.9370445609092712, + "learning_rate": 1.368254765165249e-05, + "loss": 1.0989, + "step": 6335 + }, + { + "epoch": 1.9506942040690742, + "grad_norm": 2.2398204803466797, + "learning_rate": 1.3646648962089965e-05, + "loss": 1.17, + "step": 6340 + }, + { + "epoch": 1.9522326064382138, + "grad_norm": 1.014602780342102, + "learning_rate": 1.3610779740787571e-05, + "loss": 1.1466, + "step": 6345 + }, + { + "epoch": 1.9537710088073537, + "grad_norm": 1.669202446937561, + "learning_rate": 1.3574940080845875e-05, + "loss": 1.2066, + "step": 6350 + }, + { + "epoch": 1.9553094111764933, + "grad_norm": 1.477262258529663, + "learning_rate": 1.3539130075288731e-05, + "loss": 1.0777, + "step": 6355 + }, + { + "epoch": 1.9568478135456329, + "grad_norm": 2.8099093437194824, + "learning_rate": 1.3503349817063047e-05, + "loss": 1.2807, + "step": 6360 + }, + { + "epoch": 1.9583862159147725, + "grad_norm": 1.3674906492233276, + "learning_rate": 1.3467599399038445e-05, + "loss": 1.1896, + "step": 6365 + }, + { + "epoch": 1.959924618283912, + "grad_norm": 1.12281334400177, + "learning_rate": 1.3431878914007167e-05, + "loss": 1.1885, + "step": 6370 + }, + { + "epoch": 1.961463020653052, + "grad_norm": 1.3317580223083496, + "learning_rate": 1.3396188454683745e-05, + "loss": 1.1905, + "step": 6375 + }, + { + "epoch": 1.9630014230221915, + "grad_norm": 1.4334850311279297, + "learning_rate": 1.3360528113704751e-05, + "loss": 1.1287, + "step": 6380 + }, + { + "epoch": 1.964539825391331, + "grad_norm": 1.2335008382797241, + "learning_rate": 1.332489798362862e-05, + "loss": 1.2626, + "step": 6385 + }, + { + "epoch": 1.9660782277604707, + "grad_norm": 2.188669204711914, + "learning_rate": 1.3289298156935348e-05, + "loss": 1.0611, + "step": 6390 + }, + { + "epoch": 1.9676166301296103, + "grad_norm": 2.0799977779388428, + "learning_rate": 1.3253728726026276e-05, + "loss": 1.1448, + "step": 6395 + }, + { + "epoch": 1.9691550324987501, + "grad_norm": 1.1249916553497314, + "learning_rate": 1.321818978322387e-05, + "loss": 1.0762, + "step": 6400 + }, + { + "epoch": 1.9706934348678897, + "grad_norm": 1.3204679489135742, + "learning_rate": 1.3182681420771453e-05, + "loss": 1.0773, + "step": 6405 + }, + { + "epoch": 1.9722318372370293, + "grad_norm": 1.5374983549118042, + "learning_rate": 1.3147203730832963e-05, + "loss": 1.0408, + "step": 6410 + }, + { + "epoch": 1.9737702396061692, + "grad_norm": 1.7096116542816162, + "learning_rate": 1.3111756805492752e-05, + "loss": 1.1046, + "step": 6415 + }, + { + "epoch": 1.9753086419753085, + "grad_norm": 1.8964815139770508, + "learning_rate": 1.3076340736755293e-05, + "loss": 1.1332, + "step": 6420 + }, + { + "epoch": 1.9768470443444484, + "grad_norm": 1.614093542098999, + "learning_rate": 1.304095561654498e-05, + "loss": 1.1041, + "step": 6425 + }, + { + "epoch": 1.978385446713588, + "grad_norm": 1.7104381322860718, + "learning_rate": 1.3005601536705889e-05, + "loss": 1.3129, + "step": 6430 + }, + { + "epoch": 1.9799238490827276, + "grad_norm": 1.825789451599121, + "learning_rate": 1.2970278589001505e-05, + "loss": 1.2037, + "step": 6435 + }, + { + "epoch": 1.9814622514518674, + "grad_norm": 0.9708835482597351, + "learning_rate": 1.293498686511454e-05, + "loss": 1.1983, + "step": 6440 + }, + { + "epoch": 1.9830006538210068, + "grad_norm": 1.194024920463562, + "learning_rate": 1.2899726456646635e-05, + "loss": 1.1271, + "step": 6445 + }, + { + "epoch": 1.9845390561901466, + "grad_norm": 1.3245621919631958, + "learning_rate": 1.2864497455118152e-05, + "loss": 1.1675, + "step": 6450 + }, + { + "epoch": 1.9860774585592862, + "grad_norm": 1.3149336576461792, + "learning_rate": 1.2829299951967954e-05, + "loss": 1.0849, + "step": 6455 + }, + { + "epoch": 1.9876158609284258, + "grad_norm": 1.097424030303955, + "learning_rate": 1.2794134038553141e-05, + "loss": 1.0848, + "step": 6460 + }, + { + "epoch": 1.9891542632975656, + "grad_norm": 1.0103280544281006, + "learning_rate": 1.2758999806148813e-05, + "loss": 1.1289, + "step": 6465 + }, + { + "epoch": 1.990692665666705, + "grad_norm": 1.243175983428955, + "learning_rate": 1.2723897345947828e-05, + "loss": 1.1198, + "step": 6470 + }, + { + "epoch": 1.9922310680358448, + "grad_norm": 2.570831298828125, + "learning_rate": 1.2688826749060611e-05, + "loss": 1.1318, + "step": 6475 + }, + { + "epoch": 1.9937694704049844, + "grad_norm": 1.352293610572815, + "learning_rate": 1.2653788106514852e-05, + "loss": 1.2472, + "step": 6480 + }, + { + "epoch": 1.995307872774124, + "grad_norm": 2.3871512413024902, + "learning_rate": 1.2618781509255332e-05, + "loss": 1.2089, + "step": 6485 + }, + { + "epoch": 1.9968462751432638, + "grad_norm": 0.9660901427268982, + "learning_rate": 1.2583807048143617e-05, + "loss": 1.2677, + "step": 6490 + }, + { + "epoch": 1.9983846775124032, + "grad_norm": 1.012502670288086, + "learning_rate": 1.2548864813957909e-05, + "loss": 1.214, + "step": 6495 + }, + { + "epoch": 1.999923079881543, + "grad_norm": 2.1729748249053955, + "learning_rate": 1.2513954897392727e-05, + "loss": 1.1746, + "step": 6500 + }, + { + "epoch": 2.001461482250683, + "grad_norm": 1.1099797487258911, + "learning_rate": 1.2479077389058708e-05, + "loss": 1.0032, + "step": 6505 + }, + { + "epoch": 2.0029998846198223, + "grad_norm": 1.2987159490585327, + "learning_rate": 1.2444232379482398e-05, + "loss": 1.0929, + "step": 6510 + }, + { + "epoch": 2.004538286988962, + "grad_norm": 1.4184848070144653, + "learning_rate": 1.2409419959105981e-05, + "loss": 1.1196, + "step": 6515 + }, + { + "epoch": 2.0060766893581015, + "grad_norm": 1.006828784942627, + "learning_rate": 1.237464021828704e-05, + "loss": 1.1062, + "step": 6520 + }, + { + "epoch": 2.0076150917272413, + "grad_norm": 1.3644285202026367, + "learning_rate": 1.233989324729834e-05, + "loss": 1.1301, + "step": 6525 + }, + { + "epoch": 2.009153494096381, + "grad_norm": 0.9252627491950989, + "learning_rate": 1.2305179136327608e-05, + "loss": 1.2008, + "step": 6530 + }, + { + "epoch": 2.0106918964655205, + "grad_norm": 2.844708204269409, + "learning_rate": 1.2270497975477253e-05, + "loss": 1.1736, + "step": 6535 + }, + { + "epoch": 2.0122302988346603, + "grad_norm": 1.0435899496078491, + "learning_rate": 1.2235849854764194e-05, + "loss": 1.1997, + "step": 6540 + }, + { + "epoch": 2.0137687012037997, + "grad_norm": 1.7181456089019775, + "learning_rate": 1.2201234864119554e-05, + "loss": 1.0245, + "step": 6545 + }, + { + "epoch": 2.0153071035729395, + "grad_norm": 2.6238577365875244, + "learning_rate": 1.2166653093388506e-05, + "loss": 1.0753, + "step": 6550 + }, + { + "epoch": 2.0168455059420793, + "grad_norm": 1.2975784540176392, + "learning_rate": 1.2132104632329963e-05, + "loss": 1.1651, + "step": 6555 + }, + { + "epoch": 2.0183839083112187, + "grad_norm": 1.737640380859375, + "learning_rate": 1.2097589570616394e-05, + "loss": 1.1217, + "step": 6560 + }, + { + "epoch": 2.0199223106803585, + "grad_norm": 1.2361128330230713, + "learning_rate": 1.2063107997833581e-05, + "loss": 1.1589, + "step": 6565 + }, + { + "epoch": 2.021460713049498, + "grad_norm": 1.1839301586151123, + "learning_rate": 1.2028660003480399e-05, + "loss": 1.1789, + "step": 6570 + }, + { + "epoch": 2.0229991154186377, + "grad_norm": 1.2682217359542847, + "learning_rate": 1.1994245676968538e-05, + "loss": 1.2168, + "step": 6575 + }, + { + "epoch": 2.0245375177877776, + "grad_norm": 1.4984492063522339, + "learning_rate": 1.1959865107622307e-05, + "loss": 1.0566, + "step": 6580 + }, + { + "epoch": 2.026075920156917, + "grad_norm": 1.7460529804229736, + "learning_rate": 1.1925518384678421e-05, + "loss": 1.0522, + "step": 6585 + }, + { + "epoch": 2.027614322526057, + "grad_norm": 1.5842326879501343, + "learning_rate": 1.1891205597285712e-05, + "loss": 1.0596, + "step": 6590 + }, + { + "epoch": 2.029152724895196, + "grad_norm": 1.3402278423309326, + "learning_rate": 1.1856926834504963e-05, + "loss": 1.2102, + "step": 6595 + }, + { + "epoch": 2.030691127264336, + "grad_norm": 1.3944041728973389, + "learning_rate": 1.1822682185308612e-05, + "loss": 1.1225, + "step": 6600 + }, + { + "epoch": 2.032229529633476, + "grad_norm": 1.0951045751571655, + "learning_rate": 1.1788471738580581e-05, + "loss": 1.0554, + "step": 6605 + }, + { + "epoch": 2.033767932002615, + "grad_norm": 0.8644259572029114, + "learning_rate": 1.1754295583116004e-05, + "loss": 1.1563, + "step": 6610 + }, + { + "epoch": 2.035306334371755, + "grad_norm": 0.9207926392555237, + "learning_rate": 1.1720153807620999e-05, + "loss": 1.2274, + "step": 6615 + }, + { + "epoch": 2.0368447367408944, + "grad_norm": 0.9534634947776794, + "learning_rate": 1.168604650071247e-05, + "loss": 1.1378, + "step": 6620 + }, + { + "epoch": 2.038383139110034, + "grad_norm": 1.1268069744110107, + "learning_rate": 1.1651973750917854e-05, + "loss": 1.1663, + "step": 6625 + }, + { + "epoch": 2.039921541479174, + "grad_norm": 1.331639051437378, + "learning_rate": 1.1617935646674885e-05, + "loss": 1.1084, + "step": 6630 + }, + { + "epoch": 2.0414599438483134, + "grad_norm": 1.2026249170303345, + "learning_rate": 1.1583932276331358e-05, + "loss": 1.2261, + "step": 6635 + }, + { + "epoch": 2.0429983462174532, + "grad_norm": 0.9286217093467712, + "learning_rate": 1.154996372814495e-05, + "loss": 1.0844, + "step": 6640 + }, + { + "epoch": 2.0445367485865926, + "grad_norm": 1.787848711013794, + "learning_rate": 1.1516030090282914e-05, + "loss": 1.2545, + "step": 6645 + }, + { + "epoch": 2.0460751509557324, + "grad_norm": 1.1162513494491577, + "learning_rate": 1.1482131450821937e-05, + "loss": 1.1272, + "step": 6650 + }, + { + "epoch": 2.0476135533248723, + "grad_norm": 1.1817599534988403, + "learning_rate": 1.1448267897747818e-05, + "loss": 1.256, + "step": 6655 + }, + { + "epoch": 2.0491519556940117, + "grad_norm": 1.7543742656707764, + "learning_rate": 1.1414439518955334e-05, + "loss": 1.1881, + "step": 6660 + }, + { + "epoch": 2.0506903580631515, + "grad_norm": 1.54066801071167, + "learning_rate": 1.1380646402247927e-05, + "loss": 1.1412, + "step": 6665 + }, + { + "epoch": 2.052228760432291, + "grad_norm": 1.0868725776672363, + "learning_rate": 1.1346888635337522e-05, + "loss": 1.02, + "step": 6670 + }, + { + "epoch": 2.0537671628014307, + "grad_norm": 1.416038155555725, + "learning_rate": 1.1313166305844306e-05, + "loss": 1.0248, + "step": 6675 + }, + { + "epoch": 2.0553055651705705, + "grad_norm": 1.1514451503753662, + "learning_rate": 1.1279479501296492e-05, + "loss": 1.13, + "step": 6680 + }, + { + "epoch": 2.05684396753971, + "grad_norm": 1.4153822660446167, + "learning_rate": 1.1245828309130061e-05, + "loss": 1.121, + "step": 6685 + }, + { + "epoch": 2.0583823699088497, + "grad_norm": 1.6066848039627075, + "learning_rate": 1.1212212816688558e-05, + "loss": 1.0533, + "step": 6690 + }, + { + "epoch": 2.059920772277989, + "grad_norm": 1.7452136278152466, + "learning_rate": 1.1178633111222909e-05, + "loss": 1.1413, + "step": 6695 + }, + { + "epoch": 2.061459174647129, + "grad_norm": 0.9885658025741577, + "learning_rate": 1.1145089279891102e-05, + "loss": 1.2047, + "step": 6700 + }, + { + "epoch": 2.0629975770162687, + "grad_norm": 4.310669898986816, + "learning_rate": 1.1111581409758043e-05, + "loss": 1.1422, + "step": 6705 + }, + { + "epoch": 2.064535979385408, + "grad_norm": 1.114450454711914, + "learning_rate": 1.107810958779531e-05, + "loss": 1.1967, + "step": 6710 + }, + { + "epoch": 2.066074381754548, + "grad_norm": 1.012830376625061, + "learning_rate": 1.1044673900880858e-05, + "loss": 1.1861, + "step": 6715 + }, + { + "epoch": 2.0676127841236878, + "grad_norm": 1.9352067708969116, + "learning_rate": 1.101127443579891e-05, + "loss": 1.1204, + "step": 6720 + }, + { + "epoch": 2.069151186492827, + "grad_norm": 1.8013137578964233, + "learning_rate": 1.0977911279239663e-05, + "loss": 1.1936, + "step": 6725 + }, + { + "epoch": 2.070689588861967, + "grad_norm": 1.516996145248413, + "learning_rate": 1.0944584517799045e-05, + "loss": 1.1004, + "step": 6730 + }, + { + "epoch": 2.0722279912311063, + "grad_norm": 1.0563892126083374, + "learning_rate": 1.091129423797855e-05, + "loss": 1.1286, + "step": 6735 + }, + { + "epoch": 2.073766393600246, + "grad_norm": 1.5256750583648682, + "learning_rate": 1.0878040526184965e-05, + "loss": 1.1727, + "step": 6740 + }, + { + "epoch": 2.075304795969386, + "grad_norm": 1.3976362943649292, + "learning_rate": 1.0844823468730158e-05, + "loss": 1.1221, + "step": 6745 + }, + { + "epoch": 2.0768431983385254, + "grad_norm": 1.1188265085220337, + "learning_rate": 1.081164315183088e-05, + "loss": 1.1319, + "step": 6750 + }, + { + "epoch": 2.078381600707665, + "grad_norm": 2.3145363330841064, + "learning_rate": 1.0778499661608491e-05, + "loss": 1.1622, + "step": 6755 + }, + { + "epoch": 2.0799200030768046, + "grad_norm": 2.635695457458496, + "learning_rate": 1.0745393084088789e-05, + "loss": 1.2318, + "step": 6760 + }, + { + "epoch": 2.0814584054459444, + "grad_norm": 1.1798350811004639, + "learning_rate": 1.0712323505201773e-05, + "loss": 0.9846, + "step": 6765 + }, + { + "epoch": 2.0829968078150842, + "grad_norm": 1.4065446853637695, + "learning_rate": 1.0679291010781362e-05, + "loss": 1.1356, + "step": 6770 + }, + { + "epoch": 2.0845352101842236, + "grad_norm": 1.7471407651901245, + "learning_rate": 1.0646295686565259e-05, + "loss": 1.08, + "step": 6775 + }, + { + "epoch": 2.0860736125533634, + "grad_norm": 1.3534942865371704, + "learning_rate": 1.0613337618194691e-05, + "loss": 1.1235, + "step": 6780 + }, + { + "epoch": 2.087612014922503, + "grad_norm": 0.9599131941795349, + "learning_rate": 1.0580416891214162e-05, + "loss": 1.1837, + "step": 6785 + }, + { + "epoch": 2.0891504172916426, + "grad_norm": 1.544646143913269, + "learning_rate": 1.0547533591071285e-05, + "loss": 1.2486, + "step": 6790 + }, + { + "epoch": 2.0906888196607825, + "grad_norm": 1.2420892715454102, + "learning_rate": 1.0514687803116499e-05, + "loss": 1.2542, + "step": 6795 + }, + { + "epoch": 2.092227222029922, + "grad_norm": 1.1411772966384888, + "learning_rate": 1.0481879612602882e-05, + "loss": 1.1346, + "step": 6800 + }, + { + "epoch": 2.0937656243990617, + "grad_norm": 1.7670328617095947, + "learning_rate": 1.0449109104685958e-05, + "loss": 1.0802, + "step": 6805 + }, + { + "epoch": 2.095304026768201, + "grad_norm": 2.038987636566162, + "learning_rate": 1.0416376364423396e-05, + "loss": 1.1316, + "step": 6810 + }, + { + "epoch": 2.096842429137341, + "grad_norm": 1.5804378986358643, + "learning_rate": 1.0383681476774876e-05, + "loss": 1.172, + "step": 6815 + }, + { + "epoch": 2.0983808315064807, + "grad_norm": 1.1744779348373413, + "learning_rate": 1.035102452660183e-05, + "loss": 1.1978, + "step": 6820 + }, + { + "epoch": 2.09991923387562, + "grad_norm": 1.132908582687378, + "learning_rate": 1.031840559866717e-05, + "loss": 1.1058, + "step": 6825 + }, + { + "epoch": 2.10145763624476, + "grad_norm": 1.5223757028579712, + "learning_rate": 1.0285824777635172e-05, + "loss": 1.0153, + "step": 6830 + }, + { + "epoch": 2.1029960386138993, + "grad_norm": 1.6681894063949585, + "learning_rate": 1.0253282148071198e-05, + "loss": 1.1409, + "step": 6835 + }, + { + "epoch": 2.104534440983039, + "grad_norm": 1.0178622007369995, + "learning_rate": 1.022077779444145e-05, + "loss": 1.3659, + "step": 6840 + }, + { + "epoch": 2.106072843352179, + "grad_norm": 1.5879106521606445, + "learning_rate": 1.0188311801112823e-05, + "loss": 1.0986, + "step": 6845 + }, + { + "epoch": 2.1076112457213183, + "grad_norm": 1.4189927577972412, + "learning_rate": 1.0155884252352616e-05, + "loss": 1.1785, + "step": 6850 + }, + { + "epoch": 2.109149648090458, + "grad_norm": 1.4003583192825317, + "learning_rate": 1.0123495232328342e-05, + "loss": 1.1677, + "step": 6855 + }, + { + "epoch": 2.1106880504595975, + "grad_norm": 1.5137903690338135, + "learning_rate": 1.009114482510754e-05, + "loss": 1.0557, + "step": 6860 + }, + { + "epoch": 2.1122264528287373, + "grad_norm": 1.2828454971313477, + "learning_rate": 1.0058833114657493e-05, + "loss": 0.9958, + "step": 6865 + }, + { + "epoch": 2.113764855197877, + "grad_norm": 1.0032036304473877, + "learning_rate": 1.0026560184845066e-05, + "loss": 1.0089, + "step": 6870 + }, + { + "epoch": 2.1153032575670165, + "grad_norm": 1.6723458766937256, + "learning_rate": 9.994326119436478e-06, + "loss": 1.0826, + "step": 6875 + }, + { + "epoch": 2.1168416599361564, + "grad_norm": 1.6122030019760132, + "learning_rate": 9.962131002097022e-06, + "loss": 1.0821, + "step": 6880 + }, + { + "epoch": 2.1183800623052957, + "grad_norm": 1.042712926864624, + "learning_rate": 9.929974916390953e-06, + "loss": 1.0976, + "step": 6885 + }, + { + "epoch": 2.1199184646744356, + "grad_norm": 1.353058099746704, + "learning_rate": 9.897857945781196e-06, + "loss": 1.2036, + "step": 6890 + }, + { + "epoch": 2.1214568670435754, + "grad_norm": 1.3776549100875854, + "learning_rate": 9.865780173629147e-06, + "loss": 1.1875, + "step": 6895 + }, + { + "epoch": 2.1229952694127148, + "grad_norm": 1.416298747062683, + "learning_rate": 9.833741683194475e-06, + "loss": 1.1701, + "step": 6900 + }, + { + "epoch": 2.1245336717818546, + "grad_norm": 3.549651861190796, + "learning_rate": 9.801742557634872e-06, + "loss": 1.1895, + "step": 6905 + }, + { + "epoch": 2.1260720741509944, + "grad_norm": 1.4298040866851807, + "learning_rate": 9.76978288000586e-06, + "loss": 1.2195, + "step": 6910 + }, + { + "epoch": 2.127610476520134, + "grad_norm": 1.3986989259719849, + "learning_rate": 9.73786273326059e-06, + "loss": 1.2749, + "step": 6915 + }, + { + "epoch": 2.1291488788892736, + "grad_norm": 1.6647675037384033, + "learning_rate": 9.70598220024958e-06, + "loss": 1.1431, + "step": 6920 + }, + { + "epoch": 2.130687281258413, + "grad_norm": 1.683068037033081, + "learning_rate": 9.674141363720554e-06, + "loss": 1.1656, + "step": 6925 + }, + { + "epoch": 2.132225683627553, + "grad_norm": 1.4037578105926514, + "learning_rate": 9.642340306318203e-06, + "loss": 1.1445, + "step": 6930 + }, + { + "epoch": 2.1337640859966926, + "grad_norm": 3.2454349994659424, + "learning_rate": 9.61057911058393e-06, + "loss": 1.1231, + "step": 6935 + }, + { + "epoch": 2.135302488365832, + "grad_norm": 1.0862977504730225, + "learning_rate": 9.578857858955715e-06, + "loss": 1.0971, + "step": 6940 + }, + { + "epoch": 2.136840890734972, + "grad_norm": 1.2978265285491943, + "learning_rate": 9.547176633767857e-06, + "loss": 1.2674, + "step": 6945 + }, + { + "epoch": 2.1383792931041112, + "grad_norm": 1.3811675310134888, + "learning_rate": 9.515535517250737e-06, + "loss": 1.1261, + "step": 6950 + }, + { + "epoch": 2.139917695473251, + "grad_norm": 1.3552972078323364, + "learning_rate": 9.483934591530668e-06, + "loss": 0.9968, + "step": 6955 + }, + { + "epoch": 2.141456097842391, + "grad_norm": 1.5487886667251587, + "learning_rate": 9.452373938629619e-06, + "loss": 1.1008, + "step": 6960 + }, + { + "epoch": 2.1429945002115303, + "grad_norm": 1.354445219039917, + "learning_rate": 9.420853640465025e-06, + "loss": 1.11, + "step": 6965 + }, + { + "epoch": 2.14453290258067, + "grad_norm": 1.501561164855957, + "learning_rate": 9.389373778849612e-06, + "loss": 1.2466, + "step": 6970 + }, + { + "epoch": 2.1460713049498095, + "grad_norm": 1.2629624605178833, + "learning_rate": 9.357934435491106e-06, + "loss": 1.2212, + "step": 6975 + }, + { + "epoch": 2.1476097073189493, + "grad_norm": 2.1113383769989014, + "learning_rate": 9.3265356919921e-06, + "loss": 1.1319, + "step": 6980 + }, + { + "epoch": 2.149148109688089, + "grad_norm": 1.405587077140808, + "learning_rate": 9.295177629849802e-06, + "loss": 1.2019, + "step": 6985 + }, + { + "epoch": 2.1506865120572285, + "grad_norm": 1.0090694427490234, + "learning_rate": 9.26386033045582e-06, + "loss": 1.2154, + "step": 6990 + }, + { + "epoch": 2.1522249144263683, + "grad_norm": 1.7210766077041626, + "learning_rate": 9.232583875095949e-06, + "loss": 1.1205, + "step": 6995 + }, + { + "epoch": 2.1537633167955077, + "grad_norm": 3.97908616065979, + "learning_rate": 9.201348344950001e-06, + "loss": 1.0795, + "step": 7000 + }, + { + "epoch": 2.1553017191646475, + "grad_norm": 1.465118646621704, + "learning_rate": 9.170153821091537e-06, + "loss": 1.1504, + "step": 7005 + }, + { + "epoch": 2.1568401215337873, + "grad_norm": 1.2877440452575684, + "learning_rate": 9.13900038448771e-06, + "loss": 1.2082, + "step": 7010 + }, + { + "epoch": 2.1583785239029267, + "grad_norm": 1.5617296695709229, + "learning_rate": 9.107888115999002e-06, + "loss": 1.1485, + "step": 7015 + }, + { + "epoch": 2.1599169262720666, + "grad_norm": 1.6196231842041016, + "learning_rate": 9.07681709637905e-06, + "loss": 1.1636, + "step": 7020 + }, + { + "epoch": 2.161455328641206, + "grad_norm": 1.7071741819381714, + "learning_rate": 9.045787406274437e-06, + "loss": 1.2166, + "step": 7025 + }, + { + "epoch": 2.1629937310103458, + "grad_norm": 1.6276894807815552, + "learning_rate": 9.014799126224471e-06, + "loss": 1.0885, + "step": 7030 + }, + { + "epoch": 2.1645321333794856, + "grad_norm": 1.0166163444519043, + "learning_rate": 8.983852336660959e-06, + "loss": 1.203, + "step": 7035 + }, + { + "epoch": 2.166070535748625, + "grad_norm": 1.7075228691101074, + "learning_rate": 8.952947117908047e-06, + "loss": 1.0762, + "step": 7040 + }, + { + "epoch": 2.167608938117765, + "grad_norm": 1.1821060180664062, + "learning_rate": 8.922083550181959e-06, + "loss": 1.0562, + "step": 7045 + }, + { + "epoch": 2.169147340486904, + "grad_norm": 1.4123872518539429, + "learning_rate": 8.891261713590807e-06, + "loss": 1.1853, + "step": 7050 + }, + { + "epoch": 2.170685742856044, + "grad_norm": 1.1683062314987183, + "learning_rate": 8.860481688134417e-06, + "loss": 1.1117, + "step": 7055 + }, + { + "epoch": 2.172224145225184, + "grad_norm": 2.6672306060791016, + "learning_rate": 8.829743553704056e-06, + "loss": 1.1283, + "step": 7060 + }, + { + "epoch": 2.173762547594323, + "grad_norm": 1.9148149490356445, + "learning_rate": 8.799047390082296e-06, + "loss": 1.1936, + "step": 7065 + }, + { + "epoch": 2.175300949963463, + "grad_norm": 1.1535136699676514, + "learning_rate": 8.768393276942743e-06, + "loss": 1.1856, + "step": 7070 + }, + { + "epoch": 2.1768393523326024, + "grad_norm": 2.045712471008301, + "learning_rate": 8.737781293849864e-06, + "loss": 1.2375, + "step": 7075 + }, + { + "epoch": 2.178377754701742, + "grad_norm": 1.2342534065246582, + "learning_rate": 8.707211520258782e-06, + "loss": 1.186, + "step": 7080 + }, + { + "epoch": 2.179916157070882, + "grad_norm": 1.2086020708084106, + "learning_rate": 8.676684035515076e-06, + "loss": 1.1079, + "step": 7085 + }, + { + "epoch": 2.1814545594400214, + "grad_norm": 1.6078280210494995, + "learning_rate": 8.646198918854526e-06, + "loss": 1.0061, + "step": 7090 + }, + { + "epoch": 2.1829929618091612, + "grad_norm": 1.091389775276184, + "learning_rate": 8.61575624940298e-06, + "loss": 1.0278, + "step": 7095 + }, + { + "epoch": 2.1845313641783006, + "grad_norm": 1.2044703960418701, + "learning_rate": 8.585356106176094e-06, + "loss": 1.1275, + "step": 7100 + }, + { + "epoch": 2.1860697665474405, + "grad_norm": 1.288192629814148, + "learning_rate": 8.55499856807913e-06, + "loss": 1.0621, + "step": 7105 + }, + { + "epoch": 2.1876081689165803, + "grad_norm": 1.0985409021377563, + "learning_rate": 8.524683713906805e-06, + "loss": 1.0334, + "step": 7110 + }, + { + "epoch": 2.1891465712857197, + "grad_norm": 1.8316186666488647, + "learning_rate": 8.49441162234301e-06, + "loss": 1.1259, + "step": 7115 + }, + { + "epoch": 2.1906849736548595, + "grad_norm": 1.233383059501648, + "learning_rate": 8.464182371960668e-06, + "loss": 1.213, + "step": 7120 + }, + { + "epoch": 2.192223376023999, + "grad_norm": 1.8326971530914307, + "learning_rate": 8.433996041221492e-06, + "loss": 1.165, + "step": 7125 + }, + { + "epoch": 2.1937617783931387, + "grad_norm": 1.156266450881958, + "learning_rate": 8.403852708475792e-06, + "loss": 1.1034, + "step": 7130 + }, + { + "epoch": 2.1953001807622785, + "grad_norm": 1.1190317869186401, + "learning_rate": 8.373752451962286e-06, + "loss": 1.0357, + "step": 7135 + }, + { + "epoch": 2.196838583131418, + "grad_norm": 1.4066323041915894, + "learning_rate": 8.34369534980789e-06, + "loss": 1.0494, + "step": 7140 + }, + { + "epoch": 2.1983769855005577, + "grad_norm": 1.2038296461105347, + "learning_rate": 8.31368148002748e-06, + "loss": 1.1883, + "step": 7145 + }, + { + "epoch": 2.199915387869697, + "grad_norm": 1.0969994068145752, + "learning_rate": 8.283710920523763e-06, + "loss": 1.0538, + "step": 7150 + }, + { + "epoch": 2.201453790238837, + "grad_norm": 1.6629050970077515, + "learning_rate": 8.253783749086993e-06, + "loss": 1.1396, + "step": 7155 + }, + { + "epoch": 2.2029921926079767, + "grad_norm": 1.3252410888671875, + "learning_rate": 8.223900043394825e-06, + "loss": 1.2732, + "step": 7160 + }, + { + "epoch": 2.204530594977116, + "grad_norm": 1.4837875366210938, + "learning_rate": 8.194059881012105e-06, + "loss": 1.0282, + "step": 7165 + }, + { + "epoch": 2.206068997346256, + "grad_norm": 1.2127585411071777, + "learning_rate": 8.164263339390635e-06, + "loss": 1.067, + "step": 7170 + }, + { + "epoch": 2.2076073997153958, + "grad_norm": 1.0039737224578857, + "learning_rate": 8.13451049586903e-06, + "loss": 1.1353, + "step": 7175 + }, + { + "epoch": 2.209145802084535, + "grad_norm": 2.2187910079956055, + "learning_rate": 8.104801427672456e-06, + "loss": 1.0704, + "step": 7180 + }, + { + "epoch": 2.210684204453675, + "grad_norm": 1.5790579319000244, + "learning_rate": 8.07513621191246e-06, + "loss": 1.1664, + "step": 7185 + }, + { + "epoch": 2.2122226068228144, + "grad_norm": 0.9343917369842529, + "learning_rate": 8.045514925586784e-06, + "loss": 1.2461, + "step": 7190 + }, + { + "epoch": 2.213761009191954, + "grad_norm": 1.3252272605895996, + "learning_rate": 8.015937645579148e-06, + "loss": 1.0664, + "step": 7195 + }, + { + "epoch": 2.215299411561094, + "grad_norm": 1.1168420314788818, + "learning_rate": 7.986404448659023e-06, + "loss": 1.145, + "step": 7200 + }, + { + "epoch": 2.2168378139302334, + "grad_norm": 3.980158567428589, + "learning_rate": 7.956915411481505e-06, + "loss": 1.099, + "step": 7205 + }, + { + "epoch": 2.218376216299373, + "grad_norm": 1.854601502418518, + "learning_rate": 7.927470610587028e-06, + "loss": 1.2413, + "step": 7210 + }, + { + "epoch": 2.2199146186685126, + "grad_norm": 1.069262981414795, + "learning_rate": 7.898070122401224e-06, + "loss": 1.2822, + "step": 7215 + }, + { + "epoch": 2.2214530210376524, + "grad_norm": 1.6758251190185547, + "learning_rate": 7.868714023234727e-06, + "loss": 1.0832, + "step": 7220 + }, + { + "epoch": 2.2229914234067922, + "grad_norm": 1.4776010513305664, + "learning_rate": 7.839402389282924e-06, + "loss": 1.1282, + "step": 7225 + }, + { + "epoch": 2.2245298257759316, + "grad_norm": 1.4931747913360596, + "learning_rate": 7.810135296625818e-06, + "loss": 1.2089, + "step": 7230 + }, + { + "epoch": 2.2260682281450714, + "grad_norm": 1.7104672193527222, + "learning_rate": 7.78091282122779e-06, + "loss": 1.0814, + "step": 7235 + }, + { + "epoch": 2.227606630514211, + "grad_norm": 1.155410647392273, + "learning_rate": 7.751735038937405e-06, + "loss": 1.1818, + "step": 7240 + }, + { + "epoch": 2.2291450328833506, + "grad_norm": 1.6542633771896362, + "learning_rate": 7.722602025487243e-06, + "loss": 1.2211, + "step": 7245 + }, + { + "epoch": 2.2306834352524905, + "grad_norm": 1.7440063953399658, + "learning_rate": 7.693513856493684e-06, + "loss": 1.0336, + "step": 7250 + }, + { + "epoch": 2.23222183762163, + "grad_norm": 1.1944167613983154, + "learning_rate": 7.6644706074567e-06, + "loss": 1.1502, + "step": 7255 + }, + { + "epoch": 2.2337602399907697, + "grad_norm": 1.1156004667282104, + "learning_rate": 7.63547235375966e-06, + "loss": 1.1147, + "step": 7260 + }, + { + "epoch": 2.235298642359909, + "grad_norm": 1.6988039016723633, + "learning_rate": 7.6065191706691795e-06, + "loss": 1.1423, + "step": 7265 + }, + { + "epoch": 2.236837044729049, + "grad_norm": 1.2307072877883911, + "learning_rate": 7.577611133334858e-06, + "loss": 1.2132, + "step": 7270 + }, + { + "epoch": 2.2383754470981887, + "grad_norm": 2.0237016677856445, + "learning_rate": 7.54874831678914e-06, + "loss": 0.992, + "step": 7275 + }, + { + "epoch": 2.239913849467328, + "grad_norm": 1.5955995321273804, + "learning_rate": 7.519930795947072e-06, + "loss": 1.1199, + "step": 7280 + }, + { + "epoch": 2.241452251836468, + "grad_norm": 2.980849504470825, + "learning_rate": 7.491158645606167e-06, + "loss": 1.2796, + "step": 7285 + }, + { + "epoch": 2.2429906542056073, + "grad_norm": 1.1380635499954224, + "learning_rate": 7.462431940446135e-06, + "loss": 1.1254, + "step": 7290 + }, + { + "epoch": 2.244529056574747, + "grad_norm": 1.3550338745117188, + "learning_rate": 7.433750755028773e-06, + "loss": 1.12, + "step": 7295 + }, + { + "epoch": 2.246067458943887, + "grad_norm": 1.9419124126434326, + "learning_rate": 7.40511516379769e-06, + "loss": 1.1136, + "step": 7300 + }, + { + "epoch": 2.2476058613130263, + "grad_norm": 2.2289304733276367, + "learning_rate": 7.376525241078189e-06, + "loss": 1.0981, + "step": 7305 + }, + { + "epoch": 2.249144263682166, + "grad_norm": 1.1924974918365479, + "learning_rate": 7.347981061077011e-06, + "loss": 0.9997, + "step": 7310 + }, + { + "epoch": 2.250682666051306, + "grad_norm": 2.636819362640381, + "learning_rate": 7.319482697882168e-06, + "loss": 1.1327, + "step": 7315 + }, + { + "epoch": 2.2522210684204453, + "grad_norm": 1.150176763534546, + "learning_rate": 7.291030225462781e-06, + "loss": 1.1047, + "step": 7320 + }, + { + "epoch": 2.253759470789585, + "grad_norm": 2.1936254501342773, + "learning_rate": 7.262623717668821e-06, + "loss": 1.0588, + "step": 7325 + }, + { + "epoch": 2.2552978731587245, + "grad_norm": 1.0446133613586426, + "learning_rate": 7.2342632482309825e-06, + "loss": 1.0701, + "step": 7330 + }, + { + "epoch": 2.2568362755278644, + "grad_norm": 1.2231892347335815, + "learning_rate": 7.205948890760464e-06, + "loss": 1.134, + "step": 7335 + }, + { + "epoch": 2.258374677897004, + "grad_norm": 1.346924066543579, + "learning_rate": 7.177680718748767e-06, + "loss": 1.0544, + "step": 7340 + }, + { + "epoch": 2.2599130802661436, + "grad_norm": 1.9672859907150269, + "learning_rate": 7.149458805567505e-06, + "loss": 1.1593, + "step": 7345 + }, + { + "epoch": 2.2614514826352834, + "grad_norm": 2.7677721977233887, + "learning_rate": 7.1212832244682585e-06, + "loss": 1.183, + "step": 7350 + }, + { + "epoch": 2.2629898850044228, + "grad_norm": 1.1894586086273193, + "learning_rate": 7.093154048582313e-06, + "loss": 1.0514, + "step": 7355 + }, + { + "epoch": 2.2645282873735626, + "grad_norm": 1.667694091796875, + "learning_rate": 7.065071350920538e-06, + "loss": 1.0868, + "step": 7360 + }, + { + "epoch": 2.2660666897427024, + "grad_norm": 1.0725141763687134, + "learning_rate": 7.037035204373147e-06, + "loss": 1.154, + "step": 7365 + }, + { + "epoch": 2.267605092111842, + "grad_norm": 1.3035621643066406, + "learning_rate": 7.009045681709522e-06, + "loss": 1.135, + "step": 7370 + }, + { + "epoch": 2.2691434944809816, + "grad_norm": 1.461717963218689, + "learning_rate": 6.981102855578062e-06, + "loss": 1.0843, + "step": 7375 + }, + { + "epoch": 2.270681896850121, + "grad_norm": 1.022071361541748, + "learning_rate": 6.953206798505918e-06, + "loss": 1.0135, + "step": 7380 + }, + { + "epoch": 2.272220299219261, + "grad_norm": 1.2068042755126953, + "learning_rate": 6.925357582898886e-06, + "loss": 1.1527, + "step": 7385 + }, + { + "epoch": 2.2737587015884007, + "grad_norm": 1.2736159563064575, + "learning_rate": 6.8975552810411765e-06, + "loss": 1.1415, + "step": 7390 + }, + { + "epoch": 2.27529710395754, + "grad_norm": 1.3243377208709717, + "learning_rate": 6.869799965095214e-06, + "loss": 1.1465, + "step": 7395 + }, + { + "epoch": 2.27683550632668, + "grad_norm": 1.1751413345336914, + "learning_rate": 6.842091707101473e-06, + "loss": 1.157, + "step": 7400 + }, + { + "epoch": 2.2783739086958192, + "grad_norm": 1.1726213693618774, + "learning_rate": 6.814430578978309e-06, + "loss": 1.1401, + "step": 7405 + }, + { + "epoch": 2.279912311064959, + "grad_norm": 1.083336353302002, + "learning_rate": 6.786816652521719e-06, + "loss": 1.1267, + "step": 7410 + }, + { + "epoch": 2.281450713434099, + "grad_norm": 3.8451592922210693, + "learning_rate": 6.759249999405212e-06, + "loss": 1.1394, + "step": 7415 + }, + { + "epoch": 2.2829891158032383, + "grad_norm": 1.711990237236023, + "learning_rate": 6.73173069117958e-06, + "loss": 1.1841, + "step": 7420 + }, + { + "epoch": 2.284527518172378, + "grad_norm": 1.154262900352478, + "learning_rate": 6.704258799272722e-06, + "loss": 1.0895, + "step": 7425 + }, + { + "epoch": 2.2860659205415175, + "grad_norm": 1.3660565614700317, + "learning_rate": 6.676834394989495e-06, + "loss": 1.1304, + "step": 7430 + }, + { + "epoch": 2.2876043229106573, + "grad_norm": 1.9204599857330322, + "learning_rate": 6.649457549511459e-06, + "loss": 1.033, + "step": 7435 + }, + { + "epoch": 2.289142725279797, + "grad_norm": 1.165299654006958, + "learning_rate": 6.622128333896768e-06, + "loss": 1.0643, + "step": 7440 + }, + { + "epoch": 2.2906811276489365, + "grad_norm": 1.4679332971572876, + "learning_rate": 6.594846819079939e-06, + "loss": 1.0537, + "step": 7445 + }, + { + "epoch": 2.2922195300180763, + "grad_norm": 1.5946282148361206, + "learning_rate": 6.56761307587167e-06, + "loss": 1.146, + "step": 7450 + }, + { + "epoch": 2.2937579323872157, + "grad_norm": 1.4443016052246094, + "learning_rate": 6.540427174958661e-06, + "loss": 1.1959, + "step": 7455 + }, + { + "epoch": 2.2952963347563555, + "grad_norm": 0.886721670627594, + "learning_rate": 6.513289186903463e-06, + "loss": 1.1575, + "step": 7460 + }, + { + "epoch": 2.2968347371254954, + "grad_norm": 1.1904209852218628, + "learning_rate": 6.486199182144229e-06, + "loss": 1.0784, + "step": 7465 + }, + { + "epoch": 2.2983731394946347, + "grad_norm": 1.3473881483078003, + "learning_rate": 6.459157230994603e-06, + "loss": 1.1037, + "step": 7470 + }, + { + "epoch": 2.2999115418637746, + "grad_norm": 1.1764485836029053, + "learning_rate": 6.432163403643482e-06, + "loss": 1.0831, + "step": 7475 + }, + { + "epoch": 2.301449944232914, + "grad_norm": 1.0809344053268433, + "learning_rate": 6.405217770154853e-06, + "loss": 1.2457, + "step": 7480 + }, + { + "epoch": 2.3029883466020538, + "grad_norm": 1.5625346899032593, + "learning_rate": 6.378320400467636e-06, + "loss": 1.2338, + "step": 7485 + }, + { + "epoch": 2.3045267489711936, + "grad_norm": 0.9581547379493713, + "learning_rate": 6.3514713643954475e-06, + "loss": 1.1475, + "step": 7490 + }, + { + "epoch": 2.306065151340333, + "grad_norm": 2.769169569015503, + "learning_rate": 6.324670731626478e-06, + "loss": 1.2225, + "step": 7495 + }, + { + "epoch": 2.307603553709473, + "grad_norm": 1.2585043907165527, + "learning_rate": 6.297918571723288e-06, + "loss": 1.0701, + "step": 7500 + }, + { + "epoch": 2.309141956078612, + "grad_norm": 1.8813979625701904, + "learning_rate": 6.271214954122581e-06, + "loss": 1.1267, + "step": 7505 + }, + { + "epoch": 2.310680358447752, + "grad_norm": 1.1062171459197998, + "learning_rate": 6.244559948135109e-06, + "loss": 1.0784, + "step": 7510 + }, + { + "epoch": 2.312218760816892, + "grad_norm": 1.0726068019866943, + "learning_rate": 6.217953622945449e-06, + "loss": 1.2017, + "step": 7515 + }, + { + "epoch": 2.313757163186031, + "grad_norm": 1.0642775297164917, + "learning_rate": 6.191396047611794e-06, + "loss": 1.1995, + "step": 7520 + }, + { + "epoch": 2.315295565555171, + "grad_norm": 1.322033405303955, + "learning_rate": 6.164887291065838e-06, + "loss": 1.0586, + "step": 7525 + }, + { + "epoch": 2.3168339679243104, + "grad_norm": 1.146404504776001, + "learning_rate": 6.138427422112539e-06, + "loss": 1.1833, + "step": 7530 + }, + { + "epoch": 2.3183723702934502, + "grad_norm": 1.8734060525894165, + "learning_rate": 6.1120165094299655e-06, + "loss": 1.0803, + "step": 7535 + }, + { + "epoch": 2.31991077266259, + "grad_norm": 1.196945309638977, + "learning_rate": 6.085654621569137e-06, + "loss": 1.168, + "step": 7540 + }, + { + "epoch": 2.3214491750317294, + "grad_norm": 0.9148879647254944, + "learning_rate": 6.0593418269538045e-06, + "loss": 1.0812, + "step": 7545 + }, + { + "epoch": 2.3229875774008693, + "grad_norm": 1.2411441802978516, + "learning_rate": 6.0330781938803034e-06, + "loss": 1.0235, + "step": 7550 + }, + { + "epoch": 2.3245259797700086, + "grad_norm": 1.3500548601150513, + "learning_rate": 6.006863790517392e-06, + "loss": 1.0834, + "step": 7555 + }, + { + "epoch": 2.3260643821391485, + "grad_norm": 1.3045114278793335, + "learning_rate": 5.980698684905989e-06, + "loss": 1.0431, + "step": 7560 + }, + { + "epoch": 2.3276027845082883, + "grad_norm": 1.2521401643753052, + "learning_rate": 5.954582944959111e-06, + "loss": 1.1774, + "step": 7565 + }, + { + "epoch": 2.3291411868774277, + "grad_norm": 1.590803861618042, + "learning_rate": 5.928516638461639e-06, + "loss": 1.2343, + "step": 7570 + }, + { + "epoch": 2.3306795892465675, + "grad_norm": 1.66427481174469, + "learning_rate": 5.902499833070119e-06, + "loss": 1.0343, + "step": 7575 + }, + { + "epoch": 2.332217991615707, + "grad_norm": 1.8136968612670898, + "learning_rate": 5.876532596312645e-06, + "loss": 1.2398, + "step": 7580 + }, + { + "epoch": 2.3337563939848467, + "grad_norm": 1.165752649307251, + "learning_rate": 5.850614995588627e-06, + "loss": 0.9754, + "step": 7585 + }, + { + "epoch": 2.3352947963539865, + "grad_norm": 2.509021043777466, + "learning_rate": 5.824747098168651e-06, + "loss": 1.0585, + "step": 7590 + }, + { + "epoch": 2.336833198723126, + "grad_norm": 1.7842833995819092, + "learning_rate": 5.798928971194301e-06, + "loss": 1.0558, + "step": 7595 + }, + { + "epoch": 2.3383716010922657, + "grad_norm": 1.0677943229675293, + "learning_rate": 5.773160681677983e-06, + "loss": 1.2275, + "step": 7600 + }, + { + "epoch": 2.339910003461405, + "grad_norm": 1.1144322156906128, + "learning_rate": 5.747442296502725e-06, + "loss": 1.3907, + "step": 7605 + }, + { + "epoch": 2.341448405830545, + "grad_norm": 2.3197555541992188, + "learning_rate": 5.721773882422057e-06, + "loss": 1.2227, + "step": 7610 + }, + { + "epoch": 2.3429868081996847, + "grad_norm": 1.0514782667160034, + "learning_rate": 5.69615550605978e-06, + "loss": 1.2214, + "step": 7615 + }, + { + "epoch": 2.344525210568824, + "grad_norm": 1.129257082939148, + "learning_rate": 5.6705872339098186e-06, + "loss": 1.1048, + "step": 7620 + }, + { + "epoch": 2.346063612937964, + "grad_norm": 2.5535309314727783, + "learning_rate": 5.645069132336078e-06, + "loss": 1.0361, + "step": 7625 + }, + { + "epoch": 2.3476020153071033, + "grad_norm": 1.601874828338623, + "learning_rate": 5.6196012675722055e-06, + "loss": 1.1273, + "step": 7630 + }, + { + "epoch": 2.349140417676243, + "grad_norm": 1.2392939329147339, + "learning_rate": 5.594183705721484e-06, + "loss": 0.9944, + "step": 7635 + }, + { + "epoch": 2.350678820045383, + "grad_norm": 1.375126600265503, + "learning_rate": 5.568816512756633e-06, + "loss": 1.1266, + "step": 7640 + }, + { + "epoch": 2.3522172224145224, + "grad_norm": 1.151104211807251, + "learning_rate": 5.5434997545196015e-06, + "loss": 1.2082, + "step": 7645 + }, + { + "epoch": 2.353755624783662, + "grad_norm": 1.6542187929153442, + "learning_rate": 5.5182334967214725e-06, + "loss": 1.1022, + "step": 7650 + }, + { + "epoch": 2.355294027152802, + "grad_norm": 3.7500414848327637, + "learning_rate": 5.493017804942238e-06, + "loss": 1.2594, + "step": 7655 + }, + { + "epoch": 2.3568324295219414, + "grad_norm": 0.9335038661956787, + "learning_rate": 5.467852744630633e-06, + "loss": 1.0596, + "step": 7660 + }, + { + "epoch": 2.358370831891081, + "grad_norm": 1.5884743928909302, + "learning_rate": 5.4427383811039985e-06, + "loss": 1.1742, + "step": 7665 + }, + { + "epoch": 2.3599092342602206, + "grad_norm": 1.1601238250732422, + "learning_rate": 5.417674779548062e-06, + "loss": 1.2038, + "step": 7670 + }, + { + "epoch": 2.3614476366293604, + "grad_norm": 1.1803358793258667, + "learning_rate": 5.39266200501681e-06, + "loss": 1.1756, + "step": 7675 + }, + { + "epoch": 2.3629860389985002, + "grad_norm": 1.501497507095337, + "learning_rate": 5.367700122432315e-06, + "loss": 1.1091, + "step": 7680 + }, + { + "epoch": 2.3645244413676396, + "grad_norm": 1.0719640254974365, + "learning_rate": 5.342789196584527e-06, + "loss": 1.1683, + "step": 7685 + }, + { + "epoch": 2.3660628437367794, + "grad_norm": 1.4727404117584229, + "learning_rate": 5.317929292131163e-06, + "loss": 1.1414, + "step": 7690 + }, + { + "epoch": 2.367601246105919, + "grad_norm": 1.195420742034912, + "learning_rate": 5.293120473597515e-06, + "loss": 1.1998, + "step": 7695 + }, + { + "epoch": 2.3691396484750586, + "grad_norm": 1.4516286849975586, + "learning_rate": 5.268362805376237e-06, + "loss": 1.1199, + "step": 7700 + }, + { + "epoch": 2.3706780508441985, + "grad_norm": 1.0601075887680054, + "learning_rate": 5.243656351727258e-06, + "loss": 1.1662, + "step": 7705 + }, + { + "epoch": 2.372216453213338, + "grad_norm": 1.7174220085144043, + "learning_rate": 5.219001176777574e-06, + "loss": 1.1581, + "step": 7710 + }, + { + "epoch": 2.3737548555824777, + "grad_norm": 1.3260307312011719, + "learning_rate": 5.194397344521065e-06, + "loss": 1.0319, + "step": 7715 + }, + { + "epoch": 2.375293257951617, + "grad_norm": 1.4584814310073853, + "learning_rate": 5.16984491881837e-06, + "loss": 1.1525, + "step": 7720 + }, + { + "epoch": 2.376831660320757, + "grad_norm": 1.018962025642395, + "learning_rate": 5.145343963396682e-06, + "loss": 1.2017, + "step": 7725 + }, + { + "epoch": 2.3783700626898967, + "grad_norm": 1.7327367067337036, + "learning_rate": 5.120894541849599e-06, + "loss": 1.1709, + "step": 7730 + }, + { + "epoch": 2.379908465059036, + "grad_norm": 1.5583730936050415, + "learning_rate": 5.096496717636984e-06, + "loss": 1.2094, + "step": 7735 + }, + { + "epoch": 2.381446867428176, + "grad_norm": 1.2519557476043701, + "learning_rate": 5.072150554084745e-06, + "loss": 1.0785, + "step": 7740 + }, + { + "epoch": 2.3829852697973157, + "grad_norm": 1.1944867372512817, + "learning_rate": 5.04785611438473e-06, + "loss": 1.2777, + "step": 7745 + }, + { + "epoch": 2.384523672166455, + "grad_norm": 1.240370512008667, + "learning_rate": 5.023613461594512e-06, + "loss": 1.2303, + "step": 7750 + }, + { + "epoch": 2.386062074535595, + "grad_norm": 1.117368459701538, + "learning_rate": 4.999422658637254e-06, + "loss": 1.0655, + "step": 7755 + }, + { + "epoch": 2.3876004769047343, + "grad_norm": 1.7489937543869019, + "learning_rate": 4.9752837683015505e-06, + "loss": 1.203, + "step": 7760 + }, + { + "epoch": 2.389138879273874, + "grad_norm": 1.187821388244629, + "learning_rate": 4.95119685324125e-06, + "loss": 1.1294, + "step": 7765 + }, + { + "epoch": 2.390677281643014, + "grad_norm": 1.0779712200164795, + "learning_rate": 4.927161975975284e-06, + "loss": 1.1303, + "step": 7770 + }, + { + "epoch": 2.3922156840121533, + "grad_norm": 1.066462516784668, + "learning_rate": 4.903179198887536e-06, + "loss": 1.2634, + "step": 7775 + }, + { + "epoch": 2.393754086381293, + "grad_norm": 1.1906163692474365, + "learning_rate": 4.879248584226645e-06, + "loss": 1.0123, + "step": 7780 + }, + { + "epoch": 2.3952924887504325, + "grad_norm": 1.7448662519454956, + "learning_rate": 4.85537019410586e-06, + "loss": 1.165, + "step": 7785 + }, + { + "epoch": 2.3968308911195724, + "grad_norm": 1.0964466333389282, + "learning_rate": 4.831544090502896e-06, + "loss": 1.1348, + "step": 7790 + }, + { + "epoch": 2.398369293488712, + "grad_norm": 1.2719080448150635, + "learning_rate": 4.807770335259726e-06, + "loss": 1.0279, + "step": 7795 + }, + { + "epoch": 2.3999076958578516, + "grad_norm": 1.6911888122558594, + "learning_rate": 4.784048990082484e-06, + "loss": 1.2083, + "step": 7800 + }, + { + "epoch": 2.4014460982269914, + "grad_norm": 1.3245162963867188, + "learning_rate": 4.760380116541246e-06, + "loss": 1.0125, + "step": 7805 + }, + { + "epoch": 2.402984500596131, + "grad_norm": 1.16487455368042, + "learning_rate": 4.736763776069897e-06, + "loss": 1.1643, + "step": 7810 + }, + { + "epoch": 2.4045229029652706, + "grad_norm": 1.4798593521118164, + "learning_rate": 4.713200029965978e-06, + "loss": 1.1108, + "step": 7815 + }, + { + "epoch": 2.4060613053344104, + "grad_norm": 1.7683199644088745, + "learning_rate": 4.689688939390521e-06, + "loss": 1.1573, + "step": 7820 + }, + { + "epoch": 2.40759970770355, + "grad_norm": 1.2997316122055054, + "learning_rate": 4.666230565367874e-06, + "loss": 1.132, + "step": 7825 + }, + { + "epoch": 2.4091381100726896, + "grad_norm": 1.1322225332260132, + "learning_rate": 4.642824968785572e-06, + "loss": 1.1534, + "step": 7830 + }, + { + "epoch": 2.410676512441829, + "grad_norm": 1.4464402198791504, + "learning_rate": 4.619472210394154e-06, + "loss": 1.1766, + "step": 7835 + }, + { + "epoch": 2.412214914810969, + "grad_norm": 2.9878416061401367, + "learning_rate": 4.596172350807004e-06, + "loss": 1.1984, + "step": 7840 + }, + { + "epoch": 2.4137533171801087, + "grad_norm": 1.0388379096984863, + "learning_rate": 4.572925450500232e-06, + "loss": 1.108, + "step": 7845 + }, + { + "epoch": 2.415291719549248, + "grad_norm": 1.252841591835022, + "learning_rate": 4.549731569812457e-06, + "loss": 1.1951, + "step": 7850 + }, + { + "epoch": 2.416830121918388, + "grad_norm": 1.5610461235046387, + "learning_rate": 4.526590768944713e-06, + "loss": 1.0645, + "step": 7855 + }, + { + "epoch": 2.4183685242875272, + "grad_norm": 4.270869255065918, + "learning_rate": 4.5035031079602445e-06, + "loss": 1.1204, + "step": 7860 + }, + { + "epoch": 2.419906926656667, + "grad_norm": 1.0736466646194458, + "learning_rate": 4.480468646784364e-06, + "loss": 1.0063, + "step": 7865 + }, + { + "epoch": 2.421445329025807, + "grad_norm": 1.0907769203186035, + "learning_rate": 4.457487445204311e-06, + "loss": 1.1522, + "step": 7870 + }, + { + "epoch": 2.4229837313949463, + "grad_norm": 2.1617796421051025, + "learning_rate": 4.434559562869098e-06, + "loss": 1.2111, + "step": 7875 + }, + { + "epoch": 2.424522133764086, + "grad_norm": 1.529151201248169, + "learning_rate": 4.411685059289314e-06, + "loss": 1.1406, + "step": 7880 + }, + { + "epoch": 2.4260605361332255, + "grad_norm": 2.3434321880340576, + "learning_rate": 4.388863993837031e-06, + "loss": 1.1284, + "step": 7885 + }, + { + "epoch": 2.4275989385023653, + "grad_norm": 1.4387180805206299, + "learning_rate": 4.366096425745597e-06, + "loss": 1.2728, + "step": 7890 + }, + { + "epoch": 2.429137340871505, + "grad_norm": 1.0508075952529907, + "learning_rate": 4.343382414109512e-06, + "loss": 1.2069, + "step": 7895 + }, + { + "epoch": 2.4306757432406445, + "grad_norm": 1.2803845405578613, + "learning_rate": 4.320722017884274e-06, + "loss": 1.1109, + "step": 7900 + }, + { + "epoch": 2.4322141456097843, + "grad_norm": 1.1172047853469849, + "learning_rate": 4.2981152958862155e-06, + "loss": 1.1645, + "step": 7905 + }, + { + "epoch": 2.4337525479789237, + "grad_norm": 1.058237075805664, + "learning_rate": 4.275562306792352e-06, + "loss": 1.1356, + "step": 7910 + }, + { + "epoch": 2.4352909503480635, + "grad_norm": 1.0330730676651, + "learning_rate": 4.253063109140224e-06, + "loss": 1.1087, + "step": 7915 + }, + { + "epoch": 2.4368293527172034, + "grad_norm": 1.2216901779174805, + "learning_rate": 4.2306177613277765e-06, + "loss": 1.0055, + "step": 7920 + }, + { + "epoch": 2.4383677550863427, + "grad_norm": 1.5149664878845215, + "learning_rate": 4.208226321613154e-06, + "loss": 1.1642, + "step": 7925 + }, + { + "epoch": 2.4399061574554826, + "grad_norm": 1.3599375486373901, + "learning_rate": 4.185888848114614e-06, + "loss": 0.9859, + "step": 7930 + }, + { + "epoch": 2.441444559824622, + "grad_norm": 1.1671425104141235, + "learning_rate": 4.163605398810305e-06, + "loss": 1.1014, + "step": 7935 + }, + { + "epoch": 2.4429829621937618, + "grad_norm": 1.6030899286270142, + "learning_rate": 4.141376031538186e-06, + "loss": 1.1734, + "step": 7940 + }, + { + "epoch": 2.4445213645629016, + "grad_norm": 1.209281086921692, + "learning_rate": 4.1192008039958235e-06, + "loss": 1.0406, + "step": 7945 + }, + { + "epoch": 2.446059766932041, + "grad_norm": 2.271026372909546, + "learning_rate": 4.097079773740256e-06, + "loss": 1.1752, + "step": 7950 + }, + { + "epoch": 2.447598169301181, + "grad_norm": 0.9315564632415771, + "learning_rate": 4.075012998187866e-06, + "loss": 1.1408, + "step": 7955 + }, + { + "epoch": 2.44913657167032, + "grad_norm": 1.7139698266983032, + "learning_rate": 4.053000534614218e-06, + "loss": 1.1057, + "step": 7960 + }, + { + "epoch": 2.45067497403946, + "grad_norm": 1.6565955877304077, + "learning_rate": 4.03104244015389e-06, + "loss": 1.1457, + "step": 7965 + }, + { + "epoch": 2.4522133764086, + "grad_norm": 1.3629543781280518, + "learning_rate": 4.0091387718003415e-06, + "loss": 1.2844, + "step": 7970 + }, + { + "epoch": 2.453751778777739, + "grad_norm": 2.2979674339294434, + "learning_rate": 3.987289586405785e-06, + "loss": 1.1268, + "step": 7975 + }, + { + "epoch": 2.455290181146879, + "grad_norm": 1.8692574501037598, + "learning_rate": 3.9654949406809995e-06, + "loss": 1.1367, + "step": 7980 + }, + { + "epoch": 2.4568285835160184, + "grad_norm": 1.706198811531067, + "learning_rate": 3.94375489119522e-06, + "loss": 1.1304, + "step": 7985 + }, + { + "epoch": 2.4583669858851582, + "grad_norm": 1.2476152181625366, + "learning_rate": 3.922069494375963e-06, + "loss": 1.1083, + "step": 7990 + }, + { + "epoch": 2.459905388254298, + "grad_norm": 1.4470798969268799, + "learning_rate": 3.900438806508885e-06, + "loss": 1.1529, + "step": 7995 + }, + { + "epoch": 2.4614437906234374, + "grad_norm": 1.186686635017395, + "learning_rate": 3.878862883737666e-06, + "loss": 1.1808, + "step": 8000 + }, + { + "epoch": 2.4629821929925773, + "grad_norm": 0.927186906337738, + "learning_rate": 3.857341782063812e-06, + "loss": 1.1583, + "step": 8005 + }, + { + "epoch": 2.4645205953617166, + "grad_norm": 1.1961028575897217, + "learning_rate": 3.835875557346552e-06, + "loss": 1.0924, + "step": 8010 + }, + { + "epoch": 2.4660589977308565, + "grad_norm": 1.1408125162124634, + "learning_rate": 3.814464265302692e-06, + "loss": 1.1786, + "step": 8015 + }, + { + "epoch": 2.4675974000999963, + "grad_norm": 1.9700957536697388, + "learning_rate": 3.7931079615064284e-06, + "loss": 1.0253, + "step": 8020 + }, + { + "epoch": 2.4691358024691357, + "grad_norm": 1.8638861179351807, + "learning_rate": 3.7718067013892465e-06, + "loss": 1.2051, + "step": 8025 + }, + { + "epoch": 2.4706742048382755, + "grad_norm": 1.2263269424438477, + "learning_rate": 3.7505605402397753e-06, + "loss": 1.1933, + "step": 8030 + }, + { + "epoch": 2.472212607207415, + "grad_norm": 1.856704831123352, + "learning_rate": 3.7293695332036027e-06, + "loss": 1.0577, + "step": 8035 + }, + { + "epoch": 2.4737510095765547, + "grad_norm": 1.062137246131897, + "learning_rate": 3.7082337352831923e-06, + "loss": 1.1723, + "step": 8040 + }, + { + "epoch": 2.4752894119456945, + "grad_norm": 2.7450308799743652, + "learning_rate": 3.6871532013376896e-06, + "loss": 1.3035, + "step": 8045 + }, + { + "epoch": 2.476827814314834, + "grad_norm": 1.344294786453247, + "learning_rate": 3.666127986082796e-06, + "loss": 1.2075, + "step": 8050 + }, + { + "epoch": 2.4783662166839737, + "grad_norm": 1.329000473022461, + "learning_rate": 3.645158144090649e-06, + "loss": 1.0978, + "step": 8055 + }, + { + "epoch": 2.479904619053113, + "grad_norm": 1.112028956413269, + "learning_rate": 3.624243729789642e-06, + "loss": 1.1551, + "step": 8060 + }, + { + "epoch": 2.481443021422253, + "grad_norm": 0.9248641133308411, + "learning_rate": 3.603384797464318e-06, + "loss": 1.0623, + "step": 8065 + }, + { + "epoch": 2.4829814237913927, + "grad_norm": 1.3773822784423828, + "learning_rate": 3.582581401255211e-06, + "loss": 1.146, + "step": 8070 + }, + { + "epoch": 2.484519826160532, + "grad_norm": 0.9391597509384155, + "learning_rate": 3.561833595158698e-06, + "loss": 1.1969, + "step": 8075 + }, + { + "epoch": 2.486058228529672, + "grad_norm": 1.0027233362197876, + "learning_rate": 3.5411414330268676e-06, + "loss": 0.9617, + "step": 8080 + }, + { + "epoch": 2.487596630898812, + "grad_norm": 1.8053239583969116, + "learning_rate": 3.5205049685674035e-06, + "loss": 1.0424, + "step": 8085 + }, + { + "epoch": 2.489135033267951, + "grad_norm": 1.6183481216430664, + "learning_rate": 3.4999242553433954e-06, + "loss": 1.0828, + "step": 8090 + }, + { + "epoch": 2.490673435637091, + "grad_norm": 1.1142714023590088, + "learning_rate": 3.4793993467732518e-06, + "loss": 1.2526, + "step": 8095 + }, + { + "epoch": 2.4922118380062304, + "grad_norm": 2.0687549114227295, + "learning_rate": 3.458930296130519e-06, + "loss": 1.0448, + "step": 8100 + }, + { + "epoch": 2.49375024037537, + "grad_norm": 1.1878471374511719, + "learning_rate": 3.4385171565437606e-06, + "loss": 1.1272, + "step": 8105 + }, + { + "epoch": 2.49528864274451, + "grad_norm": 1.824986219406128, + "learning_rate": 3.418159980996441e-06, + "loss": 1.0827, + "step": 8110 + }, + { + "epoch": 2.4968270451136494, + "grad_norm": 1.2641549110412598, + "learning_rate": 3.3978588223267383e-06, + "loss": 1.0358, + "step": 8115 + }, + { + "epoch": 2.498365447482789, + "grad_norm": 1.385815143585205, + "learning_rate": 3.3776137332274553e-06, + "loss": 1.1586, + "step": 8120 + }, + { + "epoch": 2.4999038498519286, + "grad_norm": 1.2434886693954468, + "learning_rate": 3.3574247662458645e-06, + "loss": 1.0912, + "step": 8125 + }, + { + "epoch": 2.5014422522210684, + "grad_norm": 1.060655951499939, + "learning_rate": 3.3372919737835574e-06, + "loss": 1.2092, + "step": 8130 + }, + { + "epoch": 2.502980654590208, + "grad_norm": 1.700639247894287, + "learning_rate": 3.317215408096322e-06, + "loss": 1.0629, + "step": 8135 + }, + { + "epoch": 2.5045190569593476, + "grad_norm": 1.9962120056152344, + "learning_rate": 3.297195121294022e-06, + "loss": 1.1027, + "step": 8140 + }, + { + "epoch": 2.5060574593284874, + "grad_norm": 0.9801268577575684, + "learning_rate": 3.2772311653404276e-06, + "loss": 1.2366, + "step": 8145 + }, + { + "epoch": 2.5075958616976273, + "grad_norm": 1.4176025390625, + "learning_rate": 3.257323592053116e-06, + "loss": 1.1483, + "step": 8150 + }, + { + "epoch": 2.5091342640667667, + "grad_norm": 0.9962260127067566, + "learning_rate": 3.2374724531033044e-06, + "loss": 1.0985, + "step": 8155 + }, + { + "epoch": 2.5106726664359065, + "grad_norm": 1.4499822854995728, + "learning_rate": 3.2176778000157367e-06, + "loss": 1.0903, + "step": 8160 + }, + { + "epoch": 2.512211068805046, + "grad_norm": 2.288475751876831, + "learning_rate": 3.1979396841685577e-06, + "loss": 1.1369, + "step": 8165 + }, + { + "epoch": 2.5137494711741857, + "grad_norm": 1.5029224157333374, + "learning_rate": 3.17825815679314e-06, + "loss": 1.2038, + "step": 8170 + }, + { + "epoch": 2.5152878735433255, + "grad_norm": 1.0565627813339233, + "learning_rate": 3.1586332689740037e-06, + "loss": 1.1077, + "step": 8175 + }, + { + "epoch": 2.516826275912465, + "grad_norm": 2.1373684406280518, + "learning_rate": 3.1390650716486474e-06, + "loss": 1.0097, + "step": 8180 + }, + { + "epoch": 2.5183646782816047, + "grad_norm": 1.037817358970642, + "learning_rate": 3.119553615607426e-06, + "loss": 1.0125, + "step": 8185 + }, + { + "epoch": 2.519903080650744, + "grad_norm": 1.1087840795516968, + "learning_rate": 3.1000989514934105e-06, + "loss": 1.0558, + "step": 8190 + }, + { + "epoch": 2.521441483019884, + "grad_norm": 1.8037227392196655, + "learning_rate": 3.0807011298022852e-06, + "loss": 1.1562, + "step": 8195 + }, + { + "epoch": 2.5229798853890237, + "grad_norm": 1.823003888130188, + "learning_rate": 3.061360200882174e-06, + "loss": 1.1018, + "step": 8200 + }, + { + "epoch": 2.524518287758163, + "grad_norm": 1.0306799411773682, + "learning_rate": 3.0420762149335565e-06, + "loss": 1.0944, + "step": 8205 + }, + { + "epoch": 2.526056690127303, + "grad_norm": 1.979805827140808, + "learning_rate": 3.022849222009097e-06, + "loss": 1.1025, + "step": 8210 + }, + { + "epoch": 2.5275950924964423, + "grad_norm": 1.0842509269714355, + "learning_rate": 3.0036792720135266e-06, + "loss": 1.1262, + "step": 8215 + }, + { + "epoch": 2.529133494865582, + "grad_norm": 1.5210621356964111, + "learning_rate": 2.9845664147035326e-06, + "loss": 1.1311, + "step": 8220 + }, + { + "epoch": 2.530671897234722, + "grad_norm": 1.3788422346115112, + "learning_rate": 2.965510699687615e-06, + "loss": 1.1261, + "step": 8225 + }, + { + "epoch": 2.5322102996038613, + "grad_norm": 1.1447571516036987, + "learning_rate": 2.9465121764259447e-06, + "loss": 1.2294, + "step": 8230 + }, + { + "epoch": 2.533748701973001, + "grad_norm": 1.024839162826538, + "learning_rate": 2.927570894230261e-06, + "loss": 1.1547, + "step": 8235 + }, + { + "epoch": 2.5352871043421406, + "grad_norm": 1.3132535219192505, + "learning_rate": 2.908686902263724e-06, + "loss": 1.0528, + "step": 8240 + }, + { + "epoch": 2.5368255067112804, + "grad_norm": 1.7579002380371094, + "learning_rate": 2.889860249540788e-06, + "loss": 1.2613, + "step": 8245 + }, + { + "epoch": 2.53836390908042, + "grad_norm": 2.2410359382629395, + "learning_rate": 2.8710909849270994e-06, + "loss": 1.0307, + "step": 8250 + }, + { + "epoch": 2.5399023114495596, + "grad_norm": 2.274693489074707, + "learning_rate": 2.852379157139329e-06, + "loss": 1.217, + "step": 8255 + }, + { + "epoch": 2.5414407138186994, + "grad_norm": 1.411793828010559, + "learning_rate": 2.8337248147450757e-06, + "loss": 1.0502, + "step": 8260 + }, + { + "epoch": 2.542979116187839, + "grad_norm": 1.5521130561828613, + "learning_rate": 2.815128006162751e-06, + "loss": 1.3229, + "step": 8265 + }, + { + "epoch": 2.5445175185569786, + "grad_norm": 0.9623503684997559, + "learning_rate": 2.7965887796613884e-06, + "loss": 1.1239, + "step": 8270 + }, + { + "epoch": 2.5460559209261184, + "grad_norm": 2.273772716522217, + "learning_rate": 2.7781071833606065e-06, + "loss": 1.063, + "step": 8275 + }, + { + "epoch": 2.547594323295258, + "grad_norm": 2.1022188663482666, + "learning_rate": 2.7596832652304283e-06, + "loss": 1.1397, + "step": 8280 + }, + { + "epoch": 2.5491327256643976, + "grad_norm": 1.1091395616531372, + "learning_rate": 2.7413170730911597e-06, + "loss": 1.1236, + "step": 8285 + }, + { + "epoch": 2.550671128033537, + "grad_norm": 2.22455096244812, + "learning_rate": 2.7230086546132907e-06, + "loss": 1.0664, + "step": 8290 + }, + { + "epoch": 2.552209530402677, + "grad_norm": 1.0935211181640625, + "learning_rate": 2.70475805731735e-06, + "loss": 1.1093, + "step": 8295 + }, + { + "epoch": 2.5537479327718167, + "grad_norm": 1.054491639137268, + "learning_rate": 2.6865653285737757e-06, + "loss": 1.1725, + "step": 8300 + }, + { + "epoch": 2.555286335140956, + "grad_norm": 1.5011531114578247, + "learning_rate": 2.668430515602832e-06, + "loss": 1.0883, + "step": 8305 + }, + { + "epoch": 2.556824737510096, + "grad_norm": 1.2729263305664062, + "learning_rate": 2.6503536654744338e-06, + "loss": 1.1141, + "step": 8310 + }, + { + "epoch": 2.5583631398792352, + "grad_norm": 1.7168303728103638, + "learning_rate": 2.6323348251080626e-06, + "loss": 1.1521, + "step": 8315 + }, + { + "epoch": 2.559901542248375, + "grad_norm": 1.7469420433044434, + "learning_rate": 2.6143740412726435e-06, + "loss": 1.0427, + "step": 8320 + }, + { + "epoch": 2.561439944617515, + "grad_norm": 1.154000997543335, + "learning_rate": 2.596471360586378e-06, + "loss": 1.1774, + "step": 8325 + }, + { + "epoch": 2.5629783469866543, + "grad_norm": 1.1220345497131348, + "learning_rate": 2.5786268295166892e-06, + "loss": 1.1262, + "step": 8330 + }, + { + "epoch": 2.564516749355794, + "grad_norm": 1.7687299251556396, + "learning_rate": 2.5608404943800622e-06, + "loss": 1.2676, + "step": 8335 + }, + { + "epoch": 2.5660551517249335, + "grad_norm": 1.3014109134674072, + "learning_rate": 2.5431124013419237e-06, + "loss": 1.0543, + "step": 8340 + }, + { + "epoch": 2.5675935540940733, + "grad_norm": 3.2140731811523438, + "learning_rate": 2.525442596416541e-06, + "loss": 1.1211, + "step": 8345 + }, + { + "epoch": 2.569131956463213, + "grad_norm": 1.5560851097106934, + "learning_rate": 2.5078311254668834e-06, + "loss": 1.1792, + "step": 8350 + }, + { + "epoch": 2.5706703588323525, + "grad_norm": 1.287881851196289, + "learning_rate": 2.490278034204502e-06, + "loss": 1.1749, + "step": 8355 + }, + { + "epoch": 2.5722087612014923, + "grad_norm": 1.4491333961486816, + "learning_rate": 2.4727833681894437e-06, + "loss": 1.079, + "step": 8360 + }, + { + "epoch": 2.5737471635706317, + "grad_norm": 1.628779411315918, + "learning_rate": 2.4553471728300885e-06, + "loss": 1.1926, + "step": 8365 + }, + { + "epoch": 2.5752855659397715, + "grad_norm": 1.1830757856369019, + "learning_rate": 2.4379694933830634e-06, + "loss": 1.1329, + "step": 8370 + }, + { + "epoch": 2.5768239683089114, + "grad_norm": 1.2473928928375244, + "learning_rate": 2.4206503749531236e-06, + "loss": 1.0945, + "step": 8375 + }, + { + "epoch": 2.5783623706780507, + "grad_norm": 0.9560319781303406, + "learning_rate": 2.4033898624929884e-06, + "loss": 1.2291, + "step": 8380 + }, + { + "epoch": 2.5799007730471906, + "grad_norm": 1.598518967628479, + "learning_rate": 2.386188000803302e-06, + "loss": 1.113, + "step": 8385 + }, + { + "epoch": 2.58143917541633, + "grad_norm": 3.0550365447998047, + "learning_rate": 2.3690448345324634e-06, + "loss": 1.2072, + "step": 8390 + }, + { + "epoch": 2.5829775777854698, + "grad_norm": 2.201508045196533, + "learning_rate": 2.351960408176518e-06, + "loss": 1.092, + "step": 8395 + }, + { + "epoch": 2.5845159801546096, + "grad_norm": 1.3211005926132202, + "learning_rate": 2.3349347660790582e-06, + "loss": 1.2229, + "step": 8400 + }, + { + "epoch": 2.586054382523749, + "grad_norm": 1.2909151315689087, + "learning_rate": 2.317967952431094e-06, + "loss": 1.0366, + "step": 8405 + }, + { + "epoch": 2.587592784892889, + "grad_norm": 1.7140899896621704, + "learning_rate": 2.3010600112709364e-06, + "loss": 1.1082, + "step": 8410 + }, + { + "epoch": 2.589131187262028, + "grad_norm": 1.139414668083191, + "learning_rate": 2.2842109864841034e-06, + "loss": 1.1882, + "step": 8415 + }, + { + "epoch": 2.590669589631168, + "grad_norm": 1.4526281356811523, + "learning_rate": 2.2674209218031787e-06, + "loss": 1.1233, + "step": 8420 + }, + { + "epoch": 2.592207992000308, + "grad_norm": 1.918116569519043, + "learning_rate": 2.25068986080772e-06, + "loss": 1.2028, + "step": 8425 + }, + { + "epoch": 2.593746394369447, + "grad_norm": 1.6036014556884766, + "learning_rate": 2.2340178469241467e-06, + "loss": 1.2098, + "step": 8430 + }, + { + "epoch": 2.595284796738587, + "grad_norm": 1.7327996492385864, + "learning_rate": 2.2174049234255895e-06, + "loss": 1.095, + "step": 8435 + }, + { + "epoch": 2.5968231991077264, + "grad_norm": 2.0552759170532227, + "learning_rate": 2.2008511334318306e-06, + "loss": 1.2627, + "step": 8440 + }, + { + "epoch": 2.5983616014768662, + "grad_norm": 1.393501877784729, + "learning_rate": 2.184356519909167e-06, + "loss": 1.0631, + "step": 8445 + }, + { + "epoch": 2.599900003846006, + "grad_norm": 3.89595890045166, + "learning_rate": 2.1679211256702884e-06, + "loss": 1.0974, + "step": 8450 + }, + { + "epoch": 2.6014384062151454, + "grad_norm": 1.317922592163086, + "learning_rate": 2.1515449933741854e-06, + "loss": 1.1896, + "step": 8455 + }, + { + "epoch": 2.6029768085842853, + "grad_norm": 1.5205051898956299, + "learning_rate": 2.135228165526032e-06, + "loss": 1.2268, + "step": 8460 + }, + { + "epoch": 2.6045152109534246, + "grad_norm": 1.2494088411331177, + "learning_rate": 2.118970684477062e-06, + "loss": 1.1769, + "step": 8465 + }, + { + "epoch": 2.6060536133225645, + "grad_norm": 1.1973810195922852, + "learning_rate": 2.1027725924244903e-06, + "loss": 1.1078, + "step": 8470 + }, + { + "epoch": 2.6075920156917043, + "grad_norm": 1.2799108028411865, + "learning_rate": 2.0866339314113662e-06, + "loss": 1.2161, + "step": 8475 + }, + { + "epoch": 2.6091304180608437, + "grad_norm": 1.1334717273712158, + "learning_rate": 2.0705547433264943e-06, + "loss": 1.1708, + "step": 8480 + }, + { + "epoch": 2.6106688204299835, + "grad_norm": 1.3745144605636597, + "learning_rate": 2.0545350699043174e-06, + "loss": 1.1895, + "step": 8485 + }, + { + "epoch": 2.612207222799123, + "grad_norm": 1.3669915199279785, + "learning_rate": 2.0385749527247837e-06, + "loss": 1.2637, + "step": 8490 + }, + { + "epoch": 2.6137456251682627, + "grad_norm": 1.3522690534591675, + "learning_rate": 2.0226744332132812e-06, + "loss": 1.1513, + "step": 8495 + }, + { + "epoch": 2.6152840275374025, + "grad_norm": 1.3111488819122314, + "learning_rate": 2.0068335526405023e-06, + "loss": 1.0883, + "step": 8500 + }, + { + "epoch": 2.616822429906542, + "grad_norm": 1.5023269653320312, + "learning_rate": 1.9910523521223355e-06, + "loss": 1.0835, + "step": 8505 + }, + { + "epoch": 2.6183608322756817, + "grad_norm": 1.2174749374389648, + "learning_rate": 1.975330872619782e-06, + "loss": 1.2002, + "step": 8510 + }, + { + "epoch": 2.619899234644821, + "grad_norm": 1.0910028219223022, + "learning_rate": 1.95966915493882e-06, + "loss": 1.2196, + "step": 8515 + }, + { + "epoch": 2.621437637013961, + "grad_norm": 1.2541165351867676, + "learning_rate": 1.9440672397303127e-06, + "loss": 1.0791, + "step": 8520 + }, + { + "epoch": 2.6229760393831008, + "grad_norm": 1.4798517227172852, + "learning_rate": 1.928525167489914e-06, + "loss": 1.2197, + "step": 8525 + }, + { + "epoch": 2.62451444175224, + "grad_norm": 2.8591833114624023, + "learning_rate": 1.913042978557944e-06, + "loss": 1.0815, + "step": 8530 + }, + { + "epoch": 2.62605284412138, + "grad_norm": 1.3371671438217163, + "learning_rate": 1.8976207131192914e-06, + "loss": 1.1042, + "step": 8535 + }, + { + "epoch": 2.6275912464905193, + "grad_norm": 1.2439570426940918, + "learning_rate": 1.8822584112033082e-06, + "loss": 1.0295, + "step": 8540 + }, + { + "epoch": 2.629129648859659, + "grad_norm": 1.8023666143417358, + "learning_rate": 1.8669561126837236e-06, + "loss": 1.1148, + "step": 8545 + }, + { + "epoch": 2.630668051228799, + "grad_norm": 1.4807002544403076, + "learning_rate": 1.8517138572784976e-06, + "loss": 1.1129, + "step": 8550 + }, + { + "epoch": 2.632206453597939, + "grad_norm": 1.9293781518936157, + "learning_rate": 1.836531684549772e-06, + "loss": 1.1038, + "step": 8555 + }, + { + "epoch": 2.633744855967078, + "grad_norm": 0.8880111575126648, + "learning_rate": 1.821409633903723e-06, + "loss": 1.2579, + "step": 8560 + }, + { + "epoch": 2.6352832583362176, + "grad_norm": 1.4220179319381714, + "learning_rate": 1.8063477445904835e-06, + "loss": 1.2608, + "step": 8565 + }, + { + "epoch": 2.6368216607053574, + "grad_norm": 1.7923526763916016, + "learning_rate": 1.7913460557040351e-06, + "loss": 1.212, + "step": 8570 + }, + { + "epoch": 2.638360063074497, + "grad_norm": 1.4313488006591797, + "learning_rate": 1.776404606182097e-06, + "loss": 1.1288, + "step": 8575 + }, + { + "epoch": 2.639898465443637, + "grad_norm": 1.1481549739837646, + "learning_rate": 1.7615234348060449e-06, + "loss": 1.1998, + "step": 8580 + }, + { + "epoch": 2.6414368678127764, + "grad_norm": 2.2744739055633545, + "learning_rate": 1.7467025802007987e-06, + "loss": 1.1055, + "step": 8585 + }, + { + "epoch": 2.642975270181916, + "grad_norm": 1.0382848978042603, + "learning_rate": 1.7319420808347142e-06, + "loss": 1.1111, + "step": 8590 + }, + { + "epoch": 2.6445136725510556, + "grad_norm": 1.4756346940994263, + "learning_rate": 1.717241975019493e-06, + "loss": 1.1661, + "step": 8595 + }, + { + "epoch": 2.6460520749201955, + "grad_norm": 0.9779161214828491, + "learning_rate": 1.7026023009100944e-06, + "loss": 0.9952, + "step": 8600 + }, + { + "epoch": 2.6475904772893353, + "grad_norm": 1.9879071712493896, + "learning_rate": 1.688023096504604e-06, + "loss": 1.2905, + "step": 8605 + }, + { + "epoch": 2.6491288796584747, + "grad_norm": 1.308909296989441, + "learning_rate": 1.673504399644174e-06, + "loss": 1.0405, + "step": 8610 + }, + { + "epoch": 2.6506672820276145, + "grad_norm": 1.6702406406402588, + "learning_rate": 1.6590462480128882e-06, + "loss": 1.101, + "step": 8615 + }, + { + "epoch": 2.652205684396754, + "grad_norm": 1.1122729778289795, + "learning_rate": 1.644648679137703e-06, + "loss": 1.0497, + "step": 8620 + }, + { + "epoch": 2.6537440867658937, + "grad_norm": 1.0254093408584595, + "learning_rate": 1.630311730388312e-06, + "loss": 1.2324, + "step": 8625 + }, + { + "epoch": 2.6552824891350335, + "grad_norm": 2.15950608253479, + "learning_rate": 1.6160354389770649e-06, + "loss": 1.1059, + "step": 8630 + }, + { + "epoch": 2.656820891504173, + "grad_norm": 1.1212074756622314, + "learning_rate": 1.6018198419588793e-06, + "loss": 1.1365, + "step": 8635 + }, + { + "epoch": 2.6583592938733127, + "grad_norm": 0.9412396550178528, + "learning_rate": 1.5876649762311458e-06, + "loss": 1.1834, + "step": 8640 + }, + { + "epoch": 2.659897696242452, + "grad_norm": 1.0666898488998413, + "learning_rate": 1.5735708785336033e-06, + "loss": 1.1222, + "step": 8645 + }, + { + "epoch": 2.661436098611592, + "grad_norm": 0.9774393439292908, + "learning_rate": 1.559537585448273e-06, + "loss": 1.0845, + "step": 8650 + }, + { + "epoch": 2.6629745009807317, + "grad_norm": 2.3956403732299805, + "learning_rate": 1.5455651333993626e-06, + "loss": 1.1634, + "step": 8655 + }, + { + "epoch": 2.664512903349871, + "grad_norm": 1.4635967016220093, + "learning_rate": 1.5316535586531483e-06, + "loss": 1.2237, + "step": 8660 + }, + { + "epoch": 2.666051305719011, + "grad_norm": 2.2025790214538574, + "learning_rate": 1.5178028973179104e-06, + "loss": 1.1956, + "step": 8665 + }, + { + "epoch": 2.6675897080881503, + "grad_norm": 1.1397591829299927, + "learning_rate": 1.504013185343811e-06, + "loss": 1.183, + "step": 8670 + }, + { + "epoch": 2.66912811045729, + "grad_norm": 1.7163399457931519, + "learning_rate": 1.4902844585228282e-06, + "loss": 1.0752, + "step": 8675 + }, + { + "epoch": 2.67066651282643, + "grad_norm": 1.7945340871810913, + "learning_rate": 1.476616752488641e-06, + "loss": 1.1313, + "step": 8680 + }, + { + "epoch": 2.6722049151955694, + "grad_norm": 1.072609543800354, + "learning_rate": 1.4630101027165444e-06, + "loss": 0.9894, + "step": 8685 + }, + { + "epoch": 2.673743317564709, + "grad_norm": 1.3113356828689575, + "learning_rate": 1.4494645445233658e-06, + "loss": 1.1927, + "step": 8690 + }, + { + "epoch": 2.6752817199338486, + "grad_norm": 2.284916639328003, + "learning_rate": 1.4359801130673616e-06, + "loss": 1.206, + "step": 8695 + }, + { + "epoch": 2.6768201223029884, + "grad_norm": 1.0177814960479736, + "learning_rate": 1.4225568433481329e-06, + "loss": 1.1056, + "step": 8700 + }, + { + "epoch": 2.678358524672128, + "grad_norm": 1.0384098291397095, + "learning_rate": 1.4091947702065262e-06, + "loss": 1.1469, + "step": 8705 + }, + { + "epoch": 2.6798969270412676, + "grad_norm": 2.263399362564087, + "learning_rate": 1.3958939283245543e-06, + "loss": 1.038, + "step": 8710 + }, + { + "epoch": 2.6814353294104074, + "grad_norm": 2.890347480773926, + "learning_rate": 1.382654352225296e-06, + "loss": 1.1278, + "step": 8715 + }, + { + "epoch": 2.682973731779547, + "grad_norm": 1.9222828149795532, + "learning_rate": 1.3694760762728215e-06, + "loss": 1.1225, + "step": 8720 + }, + { + "epoch": 2.6845121341486866, + "grad_norm": 1.1517970561981201, + "learning_rate": 1.3563591346720804e-06, + "loss": 1.1967, + "step": 8725 + }, + { + "epoch": 2.6860505365178264, + "grad_norm": 1.7945805788040161, + "learning_rate": 1.3433035614688338e-06, + "loss": 1.1659, + "step": 8730 + }, + { + "epoch": 2.687588938886966, + "grad_norm": 0.9292943477630615, + "learning_rate": 1.3303093905495528e-06, + "loss": 1.2293, + "step": 8735 + }, + { + "epoch": 2.6891273412561056, + "grad_norm": 1.3209861516952515, + "learning_rate": 1.3173766556413393e-06, + "loss": 1.0543, + "step": 8740 + }, + { + "epoch": 2.690665743625245, + "grad_norm": 1.1976464986801147, + "learning_rate": 1.3045053903118303e-06, + "loss": 1.0793, + "step": 8745 + }, + { + "epoch": 2.692204145994385, + "grad_norm": 1.2206952571868896, + "learning_rate": 1.2916956279691223e-06, + "loss": 1.1523, + "step": 8750 + }, + { + "epoch": 2.6937425483635247, + "grad_norm": 1.2255494594573975, + "learning_rate": 1.2789474018616714e-06, + "loss": 1.0986, + "step": 8755 + }, + { + "epoch": 2.695280950732664, + "grad_norm": 2.243601083755493, + "learning_rate": 1.26626074507821e-06, + "loss": 1.0889, + "step": 8760 + }, + { + "epoch": 2.696819353101804, + "grad_norm": 1.0789637565612793, + "learning_rate": 1.2536356905476748e-06, + "loss": 1.1933, + "step": 8765 + }, + { + "epoch": 2.6983577554709433, + "grad_norm": 1.564278483390808, + "learning_rate": 1.2410722710390954e-06, + "loss": 1.1819, + "step": 8770 + }, + { + "epoch": 2.699896157840083, + "grad_norm": 1.5282732248306274, + "learning_rate": 1.2285705191615426e-06, + "loss": 1.2118, + "step": 8775 + }, + { + "epoch": 2.701434560209223, + "grad_norm": 1.438302993774414, + "learning_rate": 1.21613046736401e-06, + "loss": 1.2306, + "step": 8780 + }, + { + "epoch": 2.7029729625783623, + "grad_norm": 1.248544454574585, + "learning_rate": 1.203752147935347e-06, + "loss": 1.2009, + "step": 8785 + }, + { + "epoch": 2.704511364947502, + "grad_norm": 1.5993555784225464, + "learning_rate": 1.1914355930041837e-06, + "loss": 1.0794, + "step": 8790 + }, + { + "epoch": 2.7060497673166415, + "grad_norm": 1.6044913530349731, + "learning_rate": 1.179180834538826e-06, + "loss": 1.1427, + "step": 8795 + }, + { + "epoch": 2.7075881696857813, + "grad_norm": 1.4509234428405762, + "learning_rate": 1.166987904347186e-06, + "loss": 1.123, + "step": 8800 + }, + { + "epoch": 2.709126572054921, + "grad_norm": 1.0035152435302734, + "learning_rate": 1.1548568340767036e-06, + "loss": 1.1457, + "step": 8805 + }, + { + "epoch": 2.7106649744240605, + "grad_norm": 1.530730962753296, + "learning_rate": 1.142787655214253e-06, + "loss": 1.1163, + "step": 8810 + }, + { + "epoch": 2.7122033767932003, + "grad_norm": 1.7669676542282104, + "learning_rate": 1.1307803990860594e-06, + "loss": 1.0995, + "step": 8815 + }, + { + "epoch": 2.7137417791623397, + "grad_norm": 1.4253056049346924, + "learning_rate": 1.1188350968576372e-06, + "loss": 1.1652, + "step": 8820 + }, + { + "epoch": 2.7152801815314795, + "grad_norm": 1.14907705783844, + "learning_rate": 1.1069517795336825e-06, + "loss": 1.1864, + "step": 8825 + }, + { + "epoch": 2.7168185839006194, + "grad_norm": 1.3076614141464233, + "learning_rate": 1.0951304779580146e-06, + "loss": 1.0647, + "step": 8830 + }, + { + "epoch": 2.7183569862697587, + "grad_norm": 1.654503583908081, + "learning_rate": 1.0833712228134952e-06, + "loss": 1.2105, + "step": 8835 + }, + { + "epoch": 2.7198953886388986, + "grad_norm": 1.136584758758545, + "learning_rate": 1.0716740446219175e-06, + "loss": 1.167, + "step": 8840 + }, + { + "epoch": 2.721433791008038, + "grad_norm": 1.5034282207489014, + "learning_rate": 1.0600389737439681e-06, + "loss": 1.1643, + "step": 8845 + }, + { + "epoch": 2.7229721933771778, + "grad_norm": 1.1271815299987793, + "learning_rate": 1.0484660403791314e-06, + "loss": 1.2436, + "step": 8850 + }, + { + "epoch": 2.7245105957463176, + "grad_norm": 1.1455885171890259, + "learning_rate": 1.0369552745656013e-06, + "loss": 1.0879, + "step": 8855 + }, + { + "epoch": 2.726048998115457, + "grad_norm": 1.2253069877624512, + "learning_rate": 1.025506706180221e-06, + "loss": 1.206, + "step": 8860 + }, + { + "epoch": 2.727587400484597, + "grad_norm": 1.2960996627807617, + "learning_rate": 1.0141203649383924e-06, + "loss": 1.2533, + "step": 8865 + }, + { + "epoch": 2.729125802853736, + "grad_norm": 1.177007794380188, + "learning_rate": 1.0027962803939944e-06, + "loss": 1.1394, + "step": 8870 + }, + { + "epoch": 2.730664205222876, + "grad_norm": 1.0711764097213745, + "learning_rate": 9.91534481939338e-07, + "loss": 1.1713, + "step": 8875 + }, + { + "epoch": 2.732202607592016, + "grad_norm": 1.1851369142532349, + "learning_rate": 9.80334998805041e-07, + "loss": 1.1108, + "step": 8880 + }, + { + "epoch": 2.733741009961155, + "grad_norm": 1.4155503511428833, + "learning_rate": 9.691978600599977e-07, + "loss": 1.191, + "step": 8885 + }, + { + "epoch": 2.735279412330295, + "grad_norm": 1.3459392786026, + "learning_rate": 9.581230946112824e-07, + "loss": 0.9247, + "step": 8890 + }, + { + "epoch": 2.7368178146994344, + "grad_norm": 1.4829347133636475, + "learning_rate": 9.471107312040567e-07, + "loss": 1.0897, + "step": 8895 + }, + { + "epoch": 2.7383562170685742, + "grad_norm": 1.6139320135116577, + "learning_rate": 9.361607984215342e-07, + "loss": 1.1201, + "step": 8900 + }, + { + "epoch": 2.739894619437714, + "grad_norm": 1.500117540359497, + "learning_rate": 9.25273324684886e-07, + "loss": 1.2296, + "step": 8905 + }, + { + "epoch": 2.7414330218068534, + "grad_norm": 0.9885631203651428, + "learning_rate": 9.144483382531571e-07, + "loss": 1.1622, + "step": 8910 + }, + { + "epoch": 2.7429714241759933, + "grad_norm": 1.3149123191833496, + "learning_rate": 9.036858672232057e-07, + "loss": 1.1528, + "step": 8915 + }, + { + "epoch": 2.7445098265451326, + "grad_norm": 1.1396979093551636, + "learning_rate": 8.929859395296364e-07, + "loss": 1.1723, + "step": 8920 + }, + { + "epoch": 2.7460482289142725, + "grad_norm": 1.0069197416305542, + "learning_rate": 8.823485829447003e-07, + "loss": 1.1311, + "step": 8925 + }, + { + "epoch": 2.7475866312834123, + "grad_norm": 1.2922357320785522, + "learning_rate": 8.717738250782675e-07, + "loss": 1.2323, + "step": 8930 + }, + { + "epoch": 2.7491250336525517, + "grad_norm": 1.268516182899475, + "learning_rate": 8.612616933777046e-07, + "loss": 1.1243, + "step": 8935 + }, + { + "epoch": 2.7506634360216915, + "grad_norm": 1.552514910697937, + "learning_rate": 8.508122151278442e-07, + "loss": 1.0926, + "step": 8940 + }, + { + "epoch": 2.752201838390831, + "grad_norm": 1.1461021900177002, + "learning_rate": 8.404254174509019e-07, + "loss": 1.1677, + "step": 8945 + }, + { + "epoch": 2.7537402407599707, + "grad_norm": 1.04787278175354, + "learning_rate": 8.301013273063791e-07, + "loss": 1.1723, + "step": 8950 + }, + { + "epoch": 2.7552786431291105, + "grad_norm": 2.725139617919922, + "learning_rate": 8.198399714910404e-07, + "loss": 1.183, + "step": 8955 + }, + { + "epoch": 2.75681704549825, + "grad_norm": 3.641418218612671, + "learning_rate": 8.096413766388117e-07, + "loss": 1.0918, + "step": 8960 + }, + { + "epoch": 2.7583554478673897, + "grad_norm": 1.5253046751022339, + "learning_rate": 7.995055692207127e-07, + "loss": 1.2304, + "step": 8965 + }, + { + "epoch": 2.759893850236529, + "grad_norm": 1.2496076822280884, + "learning_rate": 7.894325755448073e-07, + "loss": 1.1795, + "step": 8970 + }, + { + "epoch": 2.761432252605669, + "grad_norm": 1.7881182432174683, + "learning_rate": 7.794224217561152e-07, + "loss": 1.0852, + "step": 8975 + }, + { + "epoch": 2.7629706549748088, + "grad_norm": 2.052133083343506, + "learning_rate": 7.694751338365447e-07, + "loss": 1.1222, + "step": 8980 + }, + { + "epoch": 2.7645090573439486, + "grad_norm": 1.0607606172561646, + "learning_rate": 7.595907376048512e-07, + "loss": 1.0621, + "step": 8985 + }, + { + "epoch": 2.766047459713088, + "grad_norm": 1.3713186979293823, + "learning_rate": 7.497692587165345e-07, + "loss": 1.2151, + "step": 8990 + }, + { + "epoch": 2.7675858620822273, + "grad_norm": 1.383420467376709, + "learning_rate": 7.40010722663792e-07, + "loss": 1.0939, + "step": 8995 + }, + { + "epoch": 2.769124264451367, + "grad_norm": 0.9419933557510376, + "learning_rate": 7.303151547754627e-07, + "loss": 1.1962, + "step": 9000 + }, + { + "epoch": 2.770662666820507, + "grad_norm": 1.4444811344146729, + "learning_rate": 7.20682580216922e-07, + "loss": 1.1294, + "step": 9005 + }, + { + "epoch": 2.772201069189647, + "grad_norm": 1.7534838914871216, + "learning_rate": 7.111130239900677e-07, + "loss": 1.1511, + "step": 9010 + }, + { + "epoch": 2.773739471558786, + "grad_norm": 1.5162113904953003, + "learning_rate": 7.016065109332226e-07, + "loss": 1.0526, + "step": 9015 + }, + { + "epoch": 2.7752778739279256, + "grad_norm": 1.8117800951004028, + "learning_rate": 6.921630657210659e-07, + "loss": 1.0708, + "step": 9020 + }, + { + "epoch": 2.7768162762970654, + "grad_norm": 1.098140001296997, + "learning_rate": 6.827827128645992e-07, + "loss": 1.0874, + "step": 9025 + }, + { + "epoch": 2.7783546786662052, + "grad_norm": 0.9571955800056458, + "learning_rate": 6.734654767110521e-07, + "loss": 1.1287, + "step": 9030 + }, + { + "epoch": 2.779893081035345, + "grad_norm": 2.1803581714630127, + "learning_rate": 6.6421138144383e-07, + "loss": 1.2326, + "step": 9035 + }, + { + "epoch": 2.7814314834044844, + "grad_norm": 1.0323264598846436, + "learning_rate": 6.550204510824609e-07, + "loss": 1.1212, + "step": 9040 + }, + { + "epoch": 2.7829698857736243, + "grad_norm": 2.1114048957824707, + "learning_rate": 6.458927094825179e-07, + "loss": 1.2275, + "step": 9045 + }, + { + "epoch": 2.7845082881427636, + "grad_norm": 1.30685293674469, + "learning_rate": 6.368281803355691e-07, + "loss": 1.1677, + "step": 9050 + }, + { + "epoch": 2.7860466905119035, + "grad_norm": 1.824419379234314, + "learning_rate": 6.27826887169114e-07, + "loss": 1.057, + "step": 9055 + }, + { + "epoch": 2.7875850928810433, + "grad_norm": 0.9725929498672485, + "learning_rate": 6.188888533465053e-07, + "loss": 1.166, + "step": 9060 + }, + { + "epoch": 2.7891234952501827, + "grad_norm": 1.6305102109909058, + "learning_rate": 6.100141020669137e-07, + "loss": 1.166, + "step": 9065 + }, + { + "epoch": 2.7906618976193225, + "grad_norm": 1.65754234790802, + "learning_rate": 6.012026563652573e-07, + "loss": 1.2142, + "step": 9070 + }, + { + "epoch": 2.792200299988462, + "grad_norm": 1.303271770477295, + "learning_rate": 5.924545391121361e-07, + "loss": 1.2034, + "step": 9075 + }, + { + "epoch": 2.7937387023576017, + "grad_norm": 2.9017386436462402, + "learning_rate": 5.837697730137814e-07, + "loss": 1.0679, + "step": 9080 + }, + { + "epoch": 2.7952771047267415, + "grad_norm": 1.4329947233200073, + "learning_rate": 5.751483806119923e-07, + "loss": 1.2121, + "step": 9085 + }, + { + "epoch": 2.796815507095881, + "grad_norm": 0.9450810551643372, + "learning_rate": 5.665903842840714e-07, + "loss": 1.0761, + "step": 9090 + }, + { + "epoch": 2.7983539094650207, + "grad_norm": 2.1919000148773193, + "learning_rate": 5.580958062427866e-07, + "loss": 1.1133, + "step": 9095 + }, + { + "epoch": 2.79989231183416, + "grad_norm": 1.355247139930725, + "learning_rate": 5.496646685362844e-07, + "loss": 1.0562, + "step": 9100 + }, + { + "epoch": 2.8014307142033, + "grad_norm": 1.137313723564148, + "learning_rate": 5.412969930480599e-07, + "loss": 1.1331, + "step": 9105 + }, + { + "epoch": 2.8029691165724397, + "grad_norm": 3.3919472694396973, + "learning_rate": 5.329928014968843e-07, + "loss": 1.1539, + "step": 9110 + }, + { + "epoch": 2.804507518941579, + "grad_norm": 1.3040916919708252, + "learning_rate": 5.247521154367552e-07, + "loss": 1.1187, + "step": 9115 + }, + { + "epoch": 2.806045921310719, + "grad_norm": 1.5795443058013916, + "learning_rate": 5.165749562568323e-07, + "loss": 1.1518, + "step": 9120 + }, + { + "epoch": 2.8075843236798583, + "grad_norm": 1.486302137374878, + "learning_rate": 5.084613451813935e-07, + "loss": 1.0955, + "step": 9125 + }, + { + "epoch": 2.809122726048998, + "grad_norm": 1.4122854471206665, + "learning_rate": 5.00411303269771e-07, + "loss": 1.1842, + "step": 9130 + }, + { + "epoch": 2.810661128418138, + "grad_norm": 1.3084039688110352, + "learning_rate": 4.924248514163038e-07, + "loss": 1.1416, + "step": 9135 + }, + { + "epoch": 2.8121995307872774, + "grad_norm": 1.42378830909729, + "learning_rate": 4.845020103502712e-07, + "loss": 1.1191, + "step": 9140 + }, + { + "epoch": 2.813737933156417, + "grad_norm": 1.8624348640441895, + "learning_rate": 4.766428006358542e-07, + "loss": 1.021, + "step": 9145 + }, + { + "epoch": 2.8152763355255566, + "grad_norm": 1.3880897760391235, + "learning_rate": 4.688472426720714e-07, + "loss": 1.0401, + "step": 9150 + }, + { + "epoch": 2.8168147378946964, + "grad_norm": 1.7636202573776245, + "learning_rate": 4.611153566927373e-07, + "loss": 1.1469, + "step": 9155 + }, + { + "epoch": 2.818353140263836, + "grad_norm": 1.6327768564224243, + "learning_rate": 4.534471627663878e-07, + "loss": 1.1372, + "step": 9160 + }, + { + "epoch": 2.8198915426329756, + "grad_norm": 1.5780202150344849, + "learning_rate": 4.4584268079625735e-07, + "loss": 1.1779, + "step": 9165 + }, + { + "epoch": 2.8214299450021154, + "grad_norm": 1.1375956535339355, + "learning_rate": 4.3830193052020186e-07, + "loss": 1.11, + "step": 9170 + }, + { + "epoch": 2.822968347371255, + "grad_norm": 1.3757110834121704, + "learning_rate": 4.308249315106649e-07, + "loss": 1.0576, + "step": 9175 + }, + { + "epoch": 2.8245067497403946, + "grad_norm": 1.2449181079864502, + "learning_rate": 4.234117031746143e-07, + "loss": 1.1202, + "step": 9180 + }, + { + "epoch": 2.8260451521095344, + "grad_norm": 1.2886162996292114, + "learning_rate": 4.1606226475350287e-07, + "loss": 1.2508, + "step": 9185 + }, + { + "epoch": 2.827583554478674, + "grad_norm": 2.6857354640960693, + "learning_rate": 4.087766353232103e-07, + "loss": 1.1784, + "step": 9190 + }, + { + "epoch": 2.8291219568478136, + "grad_norm": 1.6899442672729492, + "learning_rate": 4.015548337939962e-07, + "loss": 1.2262, + "step": 9195 + }, + { + "epoch": 2.830660359216953, + "grad_norm": 1.3128477334976196, + "learning_rate": 3.943968789104496e-07, + "loss": 1.186, + "step": 9200 + }, + { + "epoch": 2.832198761586093, + "grad_norm": 1.329758882522583, + "learning_rate": 3.87302789251448e-07, + "loss": 1.1431, + "step": 9205 + }, + { + "epoch": 2.8337371639552327, + "grad_norm": 1.1432725191116333, + "learning_rate": 3.8027258323010127e-07, + "loss": 1.0308, + "step": 9210 + }, + { + "epoch": 2.835275566324372, + "grad_norm": 1.351478099822998, + "learning_rate": 3.733062790936964e-07, + "loss": 1.0939, + "step": 9215 + }, + { + "epoch": 2.836813968693512, + "grad_norm": 1.2677690982818604, + "learning_rate": 3.6640389492367534e-07, + "loss": 1.1051, + "step": 9220 + }, + { + "epoch": 2.8383523710626513, + "grad_norm": 2.572026014328003, + "learning_rate": 3.5956544863555983e-07, + "loss": 1.0794, + "step": 9225 + }, + { + "epoch": 2.839890773431791, + "grad_norm": 1.2987449169158936, + "learning_rate": 3.5279095797892127e-07, + "loss": 1.2363, + "step": 9230 + }, + { + "epoch": 2.841429175800931, + "grad_norm": 1.7580738067626953, + "learning_rate": 3.460804405373302e-07, + "loss": 1.0583, + "step": 9235 + }, + { + "epoch": 2.8429675781700703, + "grad_norm": 1.4407984018325806, + "learning_rate": 3.394339137283098e-07, + "loss": 1.2122, + "step": 9240 + }, + { + "epoch": 2.84450598053921, + "grad_norm": 1.2053810358047485, + "learning_rate": 3.328513948032991e-07, + "loss": 1.1871, + "step": 9245 + }, + { + "epoch": 2.8460443829083495, + "grad_norm": 1.9730299711227417, + "learning_rate": 3.263329008475924e-07, + "loss": 1.0696, + "step": 9250 + }, + { + "epoch": 2.8475827852774893, + "grad_norm": 0.805873453617096, + "learning_rate": 3.1987844878030307e-07, + "loss": 1.0279, + "step": 9255 + }, + { + "epoch": 2.849121187646629, + "grad_norm": 1.1469902992248535, + "learning_rate": 3.1348805535432735e-07, + "loss": 1.1212, + "step": 9260 + }, + { + "epoch": 2.8506595900157685, + "grad_norm": 1.4342927932739258, + "learning_rate": 3.071617371562946e-07, + "loss": 1.1438, + "step": 9265 + }, + { + "epoch": 2.8521979923849083, + "grad_norm": 1.4344321489334106, + "learning_rate": 3.0089951060651156e-07, + "loss": 1.2395, + "step": 9270 + }, + { + "epoch": 2.8537363947540477, + "grad_norm": 1.1882743835449219, + "learning_rate": 2.947013919589431e-07, + "loss": 1.0578, + "step": 9275 + }, + { + "epoch": 2.8552747971231875, + "grad_norm": 1.8451581001281738, + "learning_rate": 2.88567397301151e-07, + "loss": 1.1106, + "step": 9280 + }, + { + "epoch": 2.8568131994923274, + "grad_norm": 1.0893770456314087, + "learning_rate": 2.824975425542664e-07, + "loss": 1.0286, + "step": 9285 + }, + { + "epoch": 2.8583516018614668, + "grad_norm": 1.2297017574310303, + "learning_rate": 2.764918434729369e-07, + "loss": 1.1635, + "step": 9290 + }, + { + "epoch": 2.8598900042306066, + "grad_norm": 1.1600873470306396, + "learning_rate": 2.7055031564529043e-07, + "loss": 1.1557, + "step": 9295 + }, + { + "epoch": 2.861428406599746, + "grad_norm": 1.1898655891418457, + "learning_rate": 2.646729744928966e-07, + "loss": 1.0724, + "step": 9300 + }, + { + "epoch": 2.862966808968886, + "grad_norm": 1.7865312099456787, + "learning_rate": 2.588598352707278e-07, + "loss": 1.1252, + "step": 9305 + }, + { + "epoch": 2.8645052113380256, + "grad_norm": 1.6626031398773193, + "learning_rate": 2.531109130671061e-07, + "loss": 1.1687, + "step": 9310 + }, + { + "epoch": 2.866043613707165, + "grad_norm": 1.245299220085144, + "learning_rate": 2.474262228036872e-07, + "loss": 1.0814, + "step": 9315 + }, + { + "epoch": 2.867582016076305, + "grad_norm": 1.5507864952087402, + "learning_rate": 2.418057792354045e-07, + "loss": 1.085, + "step": 9320 + }, + { + "epoch": 2.869120418445444, + "grad_norm": 2.039008140563965, + "learning_rate": 2.3624959695043302e-07, + "loss": 1.0728, + "step": 9325 + }, + { + "epoch": 2.870658820814584, + "grad_norm": 1.1104451417922974, + "learning_rate": 2.3075769037015638e-07, + "loss": 1.1025, + "step": 9330 + }, + { + "epoch": 2.872197223183724, + "grad_norm": 1.1640777587890625, + "learning_rate": 2.2533007374912485e-07, + "loss": 1.2473, + "step": 9335 + }, + { + "epoch": 2.873735625552863, + "grad_norm": 1.0820621252059937, + "learning_rate": 2.1996676117502224e-07, + "loss": 1.1593, + "step": 9340 + }, + { + "epoch": 2.875274027922003, + "grad_norm": 1.981393575668335, + "learning_rate": 2.146677665686325e-07, + "loss": 1.2071, + "step": 9345 + }, + { + "epoch": 2.8768124302911424, + "grad_norm": 1.2725703716278076, + "learning_rate": 2.094331036837871e-07, + "loss": 1.179, + "step": 9350 + }, + { + "epoch": 2.8783508326602822, + "grad_norm": 1.1721535921096802, + "learning_rate": 2.0426278610735094e-07, + "loss": 1.1551, + "step": 9355 + }, + { + "epoch": 2.879889235029422, + "grad_norm": 1.6702649593353271, + "learning_rate": 1.9915682725917262e-07, + "loss": 0.9862, + "step": 9360 + }, + { + "epoch": 2.8814276373985614, + "grad_norm": 0.7977102994918823, + "learning_rate": 1.9411524039205376e-07, + "loss": 1.1702, + "step": 9365 + }, + { + "epoch": 2.8829660397677013, + "grad_norm": 1.0486433506011963, + "learning_rate": 1.891380385917213e-07, + "loss": 1.1837, + "step": 9370 + }, + { + "epoch": 2.8845044421368407, + "grad_norm": 1.5188802480697632, + "learning_rate": 1.842252347767748e-07, + "loss": 1.153, + "step": 9375 + }, + { + "epoch": 2.8860428445059805, + "grad_norm": 1.3773759603500366, + "learning_rate": 1.7937684169867797e-07, + "loss": 1.1127, + "step": 9380 + }, + { + "epoch": 2.8875812468751203, + "grad_norm": 1.7607942819595337, + "learning_rate": 1.7459287194170615e-07, + "loss": 1.1165, + "step": 9385 + }, + { + "epoch": 2.8891196492442597, + "grad_norm": 1.1703269481658936, + "learning_rate": 1.6987333792292115e-07, + "loss": 1.2549, + "step": 9390 + }, + { + "epoch": 2.8906580516133995, + "grad_norm": 1.1491749286651611, + "learning_rate": 1.6521825189213526e-07, + "loss": 1.1517, + "step": 9395 + }, + { + "epoch": 2.892196453982539, + "grad_norm": 1.7843077182769775, + "learning_rate": 1.6062762593188896e-07, + "loss": 1.0008, + "step": 9400 + }, + { + "epoch": 2.8937348563516787, + "grad_norm": 1.4365227222442627, + "learning_rate": 1.5610147195740943e-07, + "loss": 1.0936, + "step": 9405 + }, + { + "epoch": 2.8952732587208185, + "grad_norm": 1.1041733026504517, + "learning_rate": 1.5163980171658542e-07, + "loss": 1.171, + "step": 9410 + }, + { + "epoch": 2.896811661089958, + "grad_norm": 0.9337441921234131, + "learning_rate": 1.472426267899285e-07, + "loss": 1.1666, + "step": 9415 + }, + { + "epoch": 2.8983500634590977, + "grad_norm": 1.0685206651687622, + "learning_rate": 1.4290995859055633e-07, + "loss": 1.164, + "step": 9420 + }, + { + "epoch": 2.899888465828237, + "grad_norm": 1.5505434274673462, + "learning_rate": 1.386418083641483e-07, + "loss": 1.2611, + "step": 9425 + }, + { + "epoch": 2.901426868197377, + "grad_norm": 1.558901071548462, + "learning_rate": 1.3443818718893442e-07, + "loss": 1.1891, + "step": 9430 + }, + { + "epoch": 2.9029652705665168, + "grad_norm": 1.5380712747573853, + "learning_rate": 1.3029910597564532e-07, + "loss": 1.1009, + "step": 9435 + }, + { + "epoch": 2.9045036729356566, + "grad_norm": 1.8693783283233643, + "learning_rate": 1.2622457546749567e-07, + "loss": 1.1941, + "step": 9440 + }, + { + "epoch": 2.906042075304796, + "grad_norm": 1.0699331760406494, + "learning_rate": 1.2221460624016466e-07, + "loss": 1.1932, + "step": 9445 + }, + { + "epoch": 2.9075804776739353, + "grad_norm": 1.01662278175354, + "learning_rate": 1.1826920870174895e-07, + "loss": 1.2584, + "step": 9450 + }, + { + "epoch": 2.909118880043075, + "grad_norm": 1.0580191612243652, + "learning_rate": 1.143883930927514e-07, + "loss": 1.1569, + "step": 9455 + }, + { + "epoch": 2.910657282412215, + "grad_norm": 1.4499502182006836, + "learning_rate": 1.1057216948604509e-07, + "loss": 1.1326, + "step": 9460 + }, + { + "epoch": 2.912195684781355, + "grad_norm": 1.1739625930786133, + "learning_rate": 1.068205477868539e-07, + "loss": 1.011, + "step": 9465 + }, + { + "epoch": 2.913734087150494, + "grad_norm": 1.2261017560958862, + "learning_rate": 1.0313353773271917e-07, + "loss": 1.1797, + "step": 9470 + }, + { + "epoch": 2.915272489519634, + "grad_norm": 1.1943910121917725, + "learning_rate": 9.951114889348855e-08, + "loss": 1.1561, + "step": 9475 + }, + { + "epoch": 2.9168108918887734, + "grad_norm": 1.6418399810791016, + "learning_rate": 9.595339067127174e-08, + "loss": 1.084, + "step": 9480 + }, + { + "epoch": 2.9183492942579132, + "grad_norm": 1.0940558910369873, + "learning_rate": 9.24602723004292e-08, + "loss": 1.109, + "step": 9485 + }, + { + "epoch": 2.919887696627053, + "grad_norm": 1.4907678365707397, + "learning_rate": 8.903180284755008e-08, + "loss": 1.0309, + "step": 9490 + }, + { + "epoch": 2.9214260989961924, + "grad_norm": 1.6203380823135376, + "learning_rate": 8.566799121141334e-08, + "loss": 1.1081, + "step": 9495 + }, + { + "epoch": 2.9229645013653323, + "grad_norm": 1.3921715021133423, + "learning_rate": 8.23688461229849e-08, + "loss": 1.1672, + "step": 9500 + }, + { + "epoch": 2.9245029037344716, + "grad_norm": 1.6819571256637573, + "learning_rate": 7.913437614538166e-08, + "loss": 1.0484, + "step": 9505 + }, + { + "epoch": 2.9260413061036115, + "grad_norm": 1.9692211151123047, + "learning_rate": 7.596458967384922e-08, + "loss": 1.2351, + "step": 9510 + }, + { + "epoch": 2.9275797084727513, + "grad_norm": 1.2511972188949585, + "learning_rate": 7.285949493574806e-08, + "loss": 1.2732, + "step": 9515 + }, + { + "epoch": 2.9291181108418907, + "grad_norm": 0.920283854007721, + "learning_rate": 6.98190999905285e-08, + "loss": 1.1274, + "step": 9520 + }, + { + "epoch": 2.9306565132110305, + "grad_norm": 1.1254204511642456, + "learning_rate": 6.684341272970018e-08, + "loss": 1.023, + "step": 9525 + }, + { + "epoch": 2.93219491558017, + "grad_norm": 0.9248682856559753, + "learning_rate": 6.393244087683215e-08, + "loss": 1.2624, + "step": 9530 + }, + { + "epoch": 2.9337333179493097, + "grad_norm": 1.0681920051574707, + "learning_rate": 6.108619198751109e-08, + "loss": 1.1346, + "step": 9535 + }, + { + "epoch": 2.9352717203184495, + "grad_norm": 1.8738933801651, + "learning_rate": 5.8304673449338653e-08, + "loss": 1.1768, + "step": 9540 + }, + { + "epoch": 2.936810122687589, + "grad_norm": 1.5245164632797241, + "learning_rate": 5.558789248190366e-08, + "loss": 1.143, + "step": 9545 + }, + { + "epoch": 2.9383485250567287, + "grad_norm": 1.1693507432937622, + "learning_rate": 5.293585613675989e-08, + "loss": 1.1751, + "step": 9550 + }, + { + "epoch": 2.939886927425868, + "grad_norm": 1.6646199226379395, + "learning_rate": 5.034857129741777e-08, + "loss": 1.122, + "step": 9555 + }, + { + "epoch": 2.941425329795008, + "grad_norm": 1.3896692991256714, + "learning_rate": 4.782604467931939e-08, + "loss": 1.1138, + "step": 9560 + }, + { + "epoch": 2.9429637321641477, + "grad_norm": 1.3115090131759644, + "learning_rate": 4.5368282829827415e-08, + "loss": 1.1148, + "step": 9565 + }, + { + "epoch": 2.944502134533287, + "grad_norm": 1.0006541013717651, + "learning_rate": 4.2975292128200064e-08, + "loss": 1.2622, + "step": 9570 + }, + { + "epoch": 2.946040536902427, + "grad_norm": 1.6213256120681763, + "learning_rate": 4.064707878557728e-08, + "loss": 1.2263, + "step": 9575 + }, + { + "epoch": 2.9475789392715663, + "grad_norm": 4.496711254119873, + "learning_rate": 3.838364884496681e-08, + "loss": 1.2043, + "step": 9580 + }, + { + "epoch": 2.949117341640706, + "grad_norm": 1.6494616270065308, + "learning_rate": 3.618500818123039e-08, + "loss": 1.1806, + "step": 9585 + }, + { + "epoch": 2.950655744009846, + "grad_norm": 1.9413509368896484, + "learning_rate": 3.405116250106144e-08, + "loss": 1.2, + "step": 9590 + }, + { + "epoch": 2.9521941463789854, + "grad_norm": 2.1733412742614746, + "learning_rate": 3.1982117342979624e-08, + "loss": 1.0731, + "step": 9595 + }, + { + "epoch": 2.953732548748125, + "grad_norm": 1.8069325685501099, + "learning_rate": 2.9977878077305785e-08, + "loss": 1.2088, + "step": 9600 + }, + { + "epoch": 2.9552709511172646, + "grad_norm": 2.179551362991333, + "learning_rate": 2.8038449906153673e-08, + "loss": 1.082, + "step": 9605 + }, + { + "epoch": 2.9568093534864044, + "grad_norm": 1.2581409215927124, + "learning_rate": 2.6163837863418806e-08, + "loss": 1.076, + "step": 9610 + }, + { + "epoch": 2.958347755855544, + "grad_norm": 1.4859249591827393, + "learning_rate": 2.4354046814764607e-08, + "loss": 1.085, + "step": 9615 + }, + { + "epoch": 2.9598861582246836, + "grad_norm": 1.2801562547683716, + "learning_rate": 2.260908145760299e-08, + "loss": 1.1067, + "step": 9620 + }, + { + "epoch": 2.9614245605938234, + "grad_norm": 1.3596243858337402, + "learning_rate": 2.0928946321091547e-08, + "loss": 1.2352, + "step": 9625 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 1.4657814502716064, + "learning_rate": 1.931364576611139e-08, + "loss": 1.1577, + "step": 9630 + }, + { + "epoch": 2.9645013653321026, + "grad_norm": 1.2036793231964111, + "learning_rate": 1.7763183985269883e-08, + "loss": 1.1403, + "step": 9635 + }, + { + "epoch": 2.9660397677012424, + "grad_norm": 2.143611192703247, + "learning_rate": 1.6277565002875696e-08, + "loss": 1.0896, + "step": 9640 + }, + { + "epoch": 2.967578170070382, + "grad_norm": 1.2435050010681152, + "learning_rate": 1.4856792674936004e-08, + "loss": 1.2366, + "step": 9645 + }, + { + "epoch": 2.9691165724395217, + "grad_norm": 1.358459711074829, + "learning_rate": 1.3500870689145407e-08, + "loss": 1.2478, + "step": 9650 + }, + { + "epoch": 2.970654974808661, + "grad_norm": 1.11142897605896, + "learning_rate": 1.2209802564877582e-08, + "loss": 1.2521, + "step": 9655 + }, + { + "epoch": 2.972193377177801, + "grad_norm": 1.6148406267166138, + "learning_rate": 1.0983591653168645e-08, + "loss": 1.275, + "step": 9660 + }, + { + "epoch": 2.9737317795469407, + "grad_norm": 0.9854974150657654, + "learning_rate": 9.822241136722699e-09, + "loss": 1.0939, + "step": 9665 + }, + { + "epoch": 2.97527018191608, + "grad_norm": 1.0898098945617676, + "learning_rate": 8.72575402988407e-09, + "loss": 1.1264, + "step": 9670 + }, + { + "epoch": 2.97680858428522, + "grad_norm": 1.4026813507080078, + "learning_rate": 7.694133178653973e-09, + "loss": 1.0411, + "step": 9675 + }, + { + "epoch": 2.9783469866543593, + "grad_norm": 1.993453025817871, + "learning_rate": 6.727381260657195e-09, + "loss": 1.2356, + "step": 9680 + }, + { + "epoch": 2.979885389023499, + "grad_norm": 1.4184306859970093, + "learning_rate": 5.825500785150428e-09, + "loss": 1.139, + "step": 9685 + }, + { + "epoch": 2.981423791392639, + "grad_norm": 1.8053029775619507, + "learning_rate": 4.988494093022267e-09, + "loss": 1.1943, + "step": 9690 + }, + { + "epoch": 2.9829621937617783, + "grad_norm": 1.1999495029449463, + "learning_rate": 4.216363356765452e-09, + "loss": 1.2553, + "step": 9695 + }, + { + "epoch": 2.984500596130918, + "grad_norm": 1.530155062675476, + "learning_rate": 3.5091105804907487e-09, + "loss": 1.1807, + "step": 9700 + }, + { + "epoch": 2.9860389985000575, + "grad_norm": 1.3728301525115967, + "learning_rate": 2.8667375999102964e-09, + "loss": 1.2475, + "step": 9705 + }, + { + "epoch": 2.9875774008691973, + "grad_norm": 1.2360360622406006, + "learning_rate": 2.2892460823403794e-09, + "loss": 0.998, + "step": 9710 + }, + { + "epoch": 2.989115803238337, + "grad_norm": 1.8913969993591309, + "learning_rate": 1.7766375266931035e-09, + "loss": 1.0908, + "step": 9715 + }, + { + "epoch": 2.9906542056074765, + "grad_norm": 0.986847460269928, + "learning_rate": 1.328913263473619e-09, + "loss": 1.07, + "step": 9720 + }, + { + "epoch": 2.9921926079766163, + "grad_norm": 1.6557801961898804, + "learning_rate": 9.460744547745704e-10, + "loss": 1.1596, + "step": 9725 + }, + { + "epoch": 2.9937310103457557, + "grad_norm": 2.2507128715515137, + "learning_rate": 6.281220942733201e-10, + "loss": 1.0748, + "step": 9730 + }, + { + "epoch": 2.9952694127148956, + "grad_norm": 2.129380702972412, + "learning_rate": 3.750570072375004e-10, + "loss": 1.3533, + "step": 9735 + }, + { + "epoch": 2.9968078150840354, + "grad_norm": 1.2016565799713135, + "learning_rate": 1.8687985050558355e-10, + "loss": 1.0902, + "step": 9740 + }, + { + "epoch": 2.9983462174531748, + "grad_norm": 1.1874008178710938, + "learning_rate": 6.359111250908711e-11, + "loss": 1.1668, + "step": 9745 + }, + { + "epoch": 2.9998846198223146, + "grad_norm": 1.8799904584884644, + "learning_rate": 5.191113247593471e-12, + "loss": 1.1766, + "step": 9750 + }, + { + "epoch": 2.9998846198223146, + "step": 9750, + "total_flos": 7.742111986089984e+17, + "train_loss": 1.1629726183475593, + "train_runtime": 20329.8435, + "train_samples_per_second": 7.674, + "train_steps_per_second": 0.48 + } + ], + "logging_steps": 5, + "max_steps": 9750, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 7.742111986089984e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}