{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996458087367178, "eval_steps": 500, "global_step": 1269, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011806375442739079, "grad_norm": 7.55494499206543, "learning_rate": 0.0002, "loss": 5.1759, "step": 5 }, { "epoch": 0.023612750885478158, "grad_norm": 2.301419496536255, "learning_rate": 0.0002, "loss": 2.4526, "step": 10 }, { "epoch": 0.03541912632821724, "grad_norm": 1.5132728815078735, "learning_rate": 0.0002, "loss": 1.7092, "step": 15 }, { "epoch": 0.047225501770956316, "grad_norm": 1.055909514427185, "learning_rate": 0.0002, "loss": 1.3026, "step": 20 }, { "epoch": 0.0590318772136954, "grad_norm": 0.9324924945831299, "learning_rate": 0.0002, "loss": 1.109, "step": 25 }, { "epoch": 0.07083825265643448, "grad_norm": 0.9395583271980286, "learning_rate": 0.0002, "loss": 1.0204, "step": 30 }, { "epoch": 0.08264462809917356, "grad_norm": 0.8448713421821594, "learning_rate": 0.0002, "loss": 1.0172, "step": 35 }, { "epoch": 0.09445100354191263, "grad_norm": 1.0636835098266602, "learning_rate": 0.0002, "loss": 0.9394, "step": 40 }, { "epoch": 0.10625737898465171, "grad_norm": 0.9075261950492859, "learning_rate": 0.0002, "loss": 0.8359, "step": 45 }, { "epoch": 0.1180637544273908, "grad_norm": 1.1507660150527954, "learning_rate": 0.0002, "loss": 0.7415, "step": 50 }, { "epoch": 0.12987012987012986, "grad_norm": 1.0154448747634888, "learning_rate": 0.0002, "loss": 0.7834, "step": 55 }, { "epoch": 0.14167650531286896, "grad_norm": 0.8421798348426819, "learning_rate": 0.0002, "loss": 0.7619, "step": 60 }, { "epoch": 0.15348288075560804, "grad_norm": 0.9407509565353394, "learning_rate": 0.0002, "loss": 0.6744, "step": 65 }, { "epoch": 0.1652892561983471, "grad_norm": 1.009687900543213, "learning_rate": 0.0002, "loss": 0.723, "step": 70 }, { "epoch": 0.1770956316410862, "grad_norm": 0.9465011358261108, "learning_rate": 0.0002, "loss": 0.6639, "step": 75 }, { "epoch": 0.18890200708382526, "grad_norm": 1.0731524229049683, "learning_rate": 0.0002, "loss": 0.5939, "step": 80 }, { "epoch": 0.20070838252656434, "grad_norm": 0.9168630242347717, "learning_rate": 0.0002, "loss": 0.5537, "step": 85 }, { "epoch": 0.21251475796930341, "grad_norm": 0.9696341156959534, "learning_rate": 0.0002, "loss": 0.5688, "step": 90 }, { "epoch": 0.2243211334120425, "grad_norm": 1.0401453971862793, "learning_rate": 0.0002, "loss": 0.5416, "step": 95 }, { "epoch": 0.2361275088547816, "grad_norm": 1.0246028900146484, "learning_rate": 0.0002, "loss": 0.4849, "step": 100 }, { "epoch": 0.24793388429752067, "grad_norm": 0.9119220972061157, "learning_rate": 0.0002, "loss": 0.4959, "step": 105 }, { "epoch": 0.2597402597402597, "grad_norm": 0.8840236067771912, "learning_rate": 0.0002, "loss": 0.4528, "step": 110 }, { "epoch": 0.2715466351829988, "grad_norm": 0.848628044128418, "learning_rate": 0.0002, "loss": 0.4388, "step": 115 }, { "epoch": 0.2833530106257379, "grad_norm": 0.9177646040916443, "learning_rate": 0.0002, "loss": 0.4215, "step": 120 }, { "epoch": 0.29515938606847697, "grad_norm": 1.17708420753479, "learning_rate": 0.0002, "loss": 0.4642, "step": 125 }, { "epoch": 0.3069657615112161, "grad_norm": 0.8881534337997437, "learning_rate": 0.0002, "loss": 0.4192, "step": 130 }, { "epoch": 0.3187721369539551, "grad_norm": 0.8596940040588379, "learning_rate": 0.0002, "loss": 0.4232, "step": 135 }, { "epoch": 0.3305785123966942, "grad_norm": 0.8401700854301453, "learning_rate": 0.0002, "loss": 0.4054, "step": 140 }, { "epoch": 0.34238488783943327, "grad_norm": 1.042466640472412, "learning_rate": 0.0002, "loss": 0.4332, "step": 145 }, { "epoch": 0.3541912632821724, "grad_norm": 0.8484603762626648, "learning_rate": 0.0002, "loss": 0.3929, "step": 150 }, { "epoch": 0.3659976387249115, "grad_norm": 0.9610188603401184, "learning_rate": 0.0002, "loss": 0.4034, "step": 155 }, { "epoch": 0.3778040141676505, "grad_norm": 0.8308151960372925, "learning_rate": 0.0002, "loss": 0.4012, "step": 160 }, { "epoch": 0.38961038961038963, "grad_norm": 0.8959755897521973, "learning_rate": 0.0002, "loss": 0.3792, "step": 165 }, { "epoch": 0.4014167650531287, "grad_norm": 0.8881285190582275, "learning_rate": 0.0002, "loss": 0.382, "step": 170 }, { "epoch": 0.4132231404958678, "grad_norm": 0.7179512977600098, "learning_rate": 0.0002, "loss": 0.3859, "step": 175 }, { "epoch": 0.42502951593860683, "grad_norm": 0.8755255937576294, "learning_rate": 0.0002, "loss": 0.3753, "step": 180 }, { "epoch": 0.43683589138134593, "grad_norm": 0.7031023502349854, "learning_rate": 0.0002, "loss": 0.3937, "step": 185 }, { "epoch": 0.448642266824085, "grad_norm": 0.9373682737350464, "learning_rate": 0.0002, "loss": 0.4184, "step": 190 }, { "epoch": 0.4604486422668241, "grad_norm": 0.7943665981292725, "learning_rate": 0.0002, "loss": 0.3453, "step": 195 }, { "epoch": 0.4722550177095632, "grad_norm": 0.8357701897621155, "learning_rate": 0.0002, "loss": 0.3384, "step": 200 }, { "epoch": 0.48406139315230223, "grad_norm": 0.9620676636695862, "learning_rate": 0.0002, "loss": 0.3625, "step": 205 }, { "epoch": 0.49586776859504134, "grad_norm": 0.8293341398239136, "learning_rate": 0.0002, "loss": 0.3554, "step": 210 }, { "epoch": 0.5076741440377804, "grad_norm": 0.8142374157905579, "learning_rate": 0.0002, "loss": 0.3265, "step": 215 }, { "epoch": 0.5194805194805194, "grad_norm": 1.0462541580200195, "learning_rate": 0.0002, "loss": 0.3414, "step": 220 }, { "epoch": 0.5312868949232585, "grad_norm": 0.8421686887741089, "learning_rate": 0.0002, "loss": 0.3556, "step": 225 }, { "epoch": 0.5430932703659976, "grad_norm": 0.8640539646148682, "learning_rate": 0.0002, "loss": 0.3509, "step": 230 }, { "epoch": 0.5548996458087367, "grad_norm": 0.8762169480323792, "learning_rate": 0.0002, "loss": 0.3655, "step": 235 }, { "epoch": 0.5667060212514758, "grad_norm": 0.9614400863647461, "learning_rate": 0.0002, "loss": 0.3591, "step": 240 }, { "epoch": 0.5785123966942148, "grad_norm": 0.8330496549606323, "learning_rate": 0.0002, "loss": 0.3354, "step": 245 }, { "epoch": 0.5903187721369539, "grad_norm": 0.912196695804596, "learning_rate": 0.0002, "loss": 0.3648, "step": 250 }, { "epoch": 0.602125147579693, "grad_norm": 0.8851457834243774, "learning_rate": 0.0002, "loss": 0.3415, "step": 255 }, { "epoch": 0.6139315230224321, "grad_norm": 1.043445110321045, "learning_rate": 0.0002, "loss": 0.3585, "step": 260 }, { "epoch": 0.6257378984651711, "grad_norm": 0.8299534320831299, "learning_rate": 0.0002, "loss": 0.3223, "step": 265 }, { "epoch": 0.6375442739079102, "grad_norm": 0.8248724937438965, "learning_rate": 0.0002, "loss": 0.3332, "step": 270 }, { "epoch": 0.6493506493506493, "grad_norm": 0.805209755897522, "learning_rate": 0.0002, "loss": 0.3598, "step": 275 }, { "epoch": 0.6611570247933884, "grad_norm": 0.9821737408638, "learning_rate": 0.0002, "loss": 0.3383, "step": 280 }, { "epoch": 0.6729634002361276, "grad_norm": 0.904973030090332, "learning_rate": 0.0002, "loss": 0.2975, "step": 285 }, { "epoch": 0.6847697756788665, "grad_norm": 0.9315093159675598, "learning_rate": 0.0002, "loss": 0.3383, "step": 290 }, { "epoch": 0.6965761511216056, "grad_norm": 0.9074394702911377, "learning_rate": 0.0002, "loss": 0.3154, "step": 295 }, { "epoch": 0.7083825265643447, "grad_norm": 0.9916189312934875, "learning_rate": 0.0002, "loss": 0.3481, "step": 300 }, { "epoch": 0.7201889020070839, "grad_norm": 0.8280041217803955, "learning_rate": 0.0002, "loss": 0.3513, "step": 305 }, { "epoch": 0.731995277449823, "grad_norm": 0.9083949327468872, "learning_rate": 0.0002, "loss": 0.3042, "step": 310 }, { "epoch": 0.743801652892562, "grad_norm": 1.078469157218933, "learning_rate": 0.0002, "loss": 0.3244, "step": 315 }, { "epoch": 0.755608028335301, "grad_norm": 0.9485755562782288, "learning_rate": 0.0002, "loss": 0.2851, "step": 320 }, { "epoch": 0.7674144037780402, "grad_norm": 0.9043041467666626, "learning_rate": 0.0002, "loss": 0.3141, "step": 325 }, { "epoch": 0.7792207792207793, "grad_norm": 0.8514854907989502, "learning_rate": 0.0002, "loss": 0.3107, "step": 330 }, { "epoch": 0.7910271546635183, "grad_norm": 0.989473819732666, "learning_rate": 0.0002, "loss": 0.3347, "step": 335 }, { "epoch": 0.8028335301062574, "grad_norm": 0.8648626208305359, "learning_rate": 0.0002, "loss": 0.3192, "step": 340 }, { "epoch": 0.8146399055489965, "grad_norm": 0.7870430946350098, "learning_rate": 0.0002, "loss": 0.283, "step": 345 }, { "epoch": 0.8264462809917356, "grad_norm": 0.7580920457839966, "learning_rate": 0.0002, "loss": 0.3007, "step": 350 }, { "epoch": 0.8382526564344747, "grad_norm": 0.7428032755851746, "learning_rate": 0.0002, "loss": 0.3093, "step": 355 }, { "epoch": 0.8500590318772137, "grad_norm": 0.868452787399292, "learning_rate": 0.0002, "loss": 0.3052, "step": 360 }, { "epoch": 0.8618654073199528, "grad_norm": 0.8676696419715881, "learning_rate": 0.0002, "loss": 0.2911, "step": 365 }, { "epoch": 0.8736717827626919, "grad_norm": 0.8491166830062866, "learning_rate": 0.0002, "loss": 0.3025, "step": 370 }, { "epoch": 0.885478158205431, "grad_norm": 0.8106136322021484, "learning_rate": 0.0002, "loss": 0.267, "step": 375 }, { "epoch": 0.89728453364817, "grad_norm": 0.8002142906188965, "learning_rate": 0.0002, "loss": 0.3136, "step": 380 }, { "epoch": 0.9090909090909091, "grad_norm": 0.790067732334137, "learning_rate": 0.0002, "loss": 0.2859, "step": 385 }, { "epoch": 0.9208972845336482, "grad_norm": 0.7977219223976135, "learning_rate": 0.0002, "loss": 0.3154, "step": 390 }, { "epoch": 0.9327036599763873, "grad_norm": 0.7339850664138794, "learning_rate": 0.0002, "loss": 0.2884, "step": 395 }, { "epoch": 0.9445100354191264, "grad_norm": 0.7909967303276062, "learning_rate": 0.0002, "loss": 0.3024, "step": 400 }, { "epoch": 0.9563164108618654, "grad_norm": 0.7345856428146362, "learning_rate": 0.0002, "loss": 0.2774, "step": 405 }, { "epoch": 0.9681227863046045, "grad_norm": 0.6886624693870544, "learning_rate": 0.0002, "loss": 0.2662, "step": 410 }, { "epoch": 0.9799291617473436, "grad_norm": 0.8391503095626831, "learning_rate": 0.0002, "loss": 0.2498, "step": 415 }, { "epoch": 0.9917355371900827, "grad_norm": 0.7378864288330078, "learning_rate": 0.0002, "loss": 0.2652, "step": 420 }, { "epoch": 1.0035419126328218, "grad_norm": 0.7516870498657227, "learning_rate": 0.0002, "loss": 0.2784, "step": 425 }, { "epoch": 1.0153482880755609, "grad_norm": 0.803993821144104, "learning_rate": 0.0002, "loss": 0.2513, "step": 430 }, { "epoch": 1.0271546635183, "grad_norm": 0.7285071015357971, "learning_rate": 0.0002, "loss": 0.2524, "step": 435 }, { "epoch": 1.0389610389610389, "grad_norm": 0.6480122804641724, "learning_rate": 0.0002, "loss": 0.235, "step": 440 }, { "epoch": 1.050767414403778, "grad_norm": 0.7714098691940308, "learning_rate": 0.0002, "loss": 0.2659, "step": 445 }, { "epoch": 1.062573789846517, "grad_norm": 0.914413332939148, "learning_rate": 0.0002, "loss": 0.2883, "step": 450 }, { "epoch": 1.0743801652892562, "grad_norm": 0.950078547000885, "learning_rate": 0.0002, "loss": 0.2542, "step": 455 }, { "epoch": 1.0861865407319953, "grad_norm": 0.8367085456848145, "learning_rate": 0.0002, "loss": 0.2575, "step": 460 }, { "epoch": 1.0979929161747344, "grad_norm": 0.8421709537506104, "learning_rate": 0.0002, "loss": 0.2362, "step": 465 }, { "epoch": 1.1097992916174735, "grad_norm": 0.7322567105293274, "learning_rate": 0.0002, "loss": 0.2161, "step": 470 }, { "epoch": 1.1216056670602126, "grad_norm": 0.750337541103363, "learning_rate": 0.0002, "loss": 0.2495, "step": 475 }, { "epoch": 1.1334120425029517, "grad_norm": 0.7660607099533081, "learning_rate": 0.0002, "loss": 0.2704, "step": 480 }, { "epoch": 1.1452184179456908, "grad_norm": 0.8482415676116943, "learning_rate": 0.0002, "loss": 0.25, "step": 485 }, { "epoch": 1.1570247933884297, "grad_norm": 0.6941173076629639, "learning_rate": 0.0002, "loss": 0.2488, "step": 490 }, { "epoch": 1.1688311688311688, "grad_norm": 0.6488157510757446, "learning_rate": 0.0002, "loss": 0.2057, "step": 495 }, { "epoch": 1.1806375442739079, "grad_norm": 0.7544688582420349, "learning_rate": 0.0002, "loss": 0.2622, "step": 500 }, { "epoch": 1.192443919716647, "grad_norm": 0.6548221111297607, "learning_rate": 0.0002, "loss": 0.2452, "step": 505 }, { "epoch": 1.204250295159386, "grad_norm": 0.6706327199935913, "learning_rate": 0.0002, "loss": 0.2374, "step": 510 }, { "epoch": 1.2160566706021252, "grad_norm": 0.8103892207145691, "learning_rate": 0.0002, "loss": 0.2272, "step": 515 }, { "epoch": 1.2278630460448643, "grad_norm": 0.8987573385238647, "learning_rate": 0.0002, "loss": 0.2231, "step": 520 }, { "epoch": 1.2396694214876034, "grad_norm": 0.8000391721725464, "learning_rate": 0.0002, "loss": 0.2167, "step": 525 }, { "epoch": 1.2514757969303423, "grad_norm": 0.6645796895027161, "learning_rate": 0.0002, "loss": 0.2523, "step": 530 }, { "epoch": 1.2632821723730814, "grad_norm": 0.712792158126831, "learning_rate": 0.0002, "loss": 0.2177, "step": 535 }, { "epoch": 1.2750885478158205, "grad_norm": 0.6801431775093079, "learning_rate": 0.0002, "loss": 0.2301, "step": 540 }, { "epoch": 1.2868949232585596, "grad_norm": 0.8651431798934937, "learning_rate": 0.0002, "loss": 0.2236, "step": 545 }, { "epoch": 1.2987012987012987, "grad_norm": 0.6562423706054688, "learning_rate": 0.0002, "loss": 0.2367, "step": 550 }, { "epoch": 1.3105076741440378, "grad_norm": 0.6282105445861816, "learning_rate": 0.0002, "loss": 0.2448, "step": 555 }, { "epoch": 1.322314049586777, "grad_norm": 0.6442841291427612, "learning_rate": 0.0002, "loss": 0.2225, "step": 560 }, { "epoch": 1.334120425029516, "grad_norm": 0.6362649202346802, "learning_rate": 0.0002, "loss": 0.2095, "step": 565 }, { "epoch": 1.345926800472255, "grad_norm": 0.6888054609298706, "learning_rate": 0.0002, "loss": 0.2234, "step": 570 }, { "epoch": 1.3577331759149942, "grad_norm": 0.7552103996276855, "learning_rate": 0.0002, "loss": 0.2472, "step": 575 }, { "epoch": 1.3695395513577333, "grad_norm": 0.6695733070373535, "learning_rate": 0.0002, "loss": 0.2483, "step": 580 }, { "epoch": 1.3813459268004722, "grad_norm": 0.7165626883506775, "learning_rate": 0.0002, "loss": 0.2352, "step": 585 }, { "epoch": 1.3931523022432113, "grad_norm": 0.6626814007759094, "learning_rate": 0.0002, "loss": 0.2336, "step": 590 }, { "epoch": 1.4049586776859504, "grad_norm": 0.6331655383110046, "learning_rate": 0.0002, "loss": 0.2432, "step": 595 }, { "epoch": 1.4167650531286895, "grad_norm": 0.7248314619064331, "learning_rate": 0.0002, "loss": 0.2326, "step": 600 }, { "epoch": 1.4285714285714286, "grad_norm": 0.65913987159729, "learning_rate": 0.0002, "loss": 0.2213, "step": 605 }, { "epoch": 1.4403778040141677, "grad_norm": 0.7278943657875061, "learning_rate": 0.0002, "loss": 0.2391, "step": 610 }, { "epoch": 1.4521841794569068, "grad_norm": 0.5790348649024963, "learning_rate": 0.0002, "loss": 0.2276, "step": 615 }, { "epoch": 1.4639905548996457, "grad_norm": 0.713111400604248, "learning_rate": 0.0002, "loss": 0.2263, "step": 620 }, { "epoch": 1.4757969303423848, "grad_norm": 0.806976854801178, "learning_rate": 0.0002, "loss": 0.2421, "step": 625 }, { "epoch": 1.487603305785124, "grad_norm": 0.7841593027114868, "learning_rate": 0.0002, "loss": 0.2287, "step": 630 }, { "epoch": 1.499409681227863, "grad_norm": 0.8648158311843872, "learning_rate": 0.0002, "loss": 0.218, "step": 635 }, { "epoch": 1.511216056670602, "grad_norm": 0.6897756457328796, "learning_rate": 0.0002, "loss": 0.2265, "step": 640 }, { "epoch": 1.5230224321133412, "grad_norm": 0.736971378326416, "learning_rate": 0.0002, "loss": 0.2394, "step": 645 }, { "epoch": 1.5348288075560803, "grad_norm": 0.705877959728241, "learning_rate": 0.0002, "loss": 0.211, "step": 650 }, { "epoch": 1.5466351829988194, "grad_norm": 0.7128683924674988, "learning_rate": 0.0002, "loss": 0.2245, "step": 655 }, { "epoch": 1.5584415584415585, "grad_norm": 1.0384072065353394, "learning_rate": 0.0002, "loss": 0.2179, "step": 660 }, { "epoch": 1.5702479338842976, "grad_norm": 0.7156550288200378, "learning_rate": 0.0002, "loss": 0.2154, "step": 665 }, { "epoch": 1.5820543093270367, "grad_norm": 0.6507942080497742, "learning_rate": 0.0002, "loss": 0.2124, "step": 670 }, { "epoch": 1.5938606847697758, "grad_norm": 0.6894711256027222, "learning_rate": 0.0002, "loss": 0.2353, "step": 675 }, { "epoch": 1.6056670602125147, "grad_norm": 0.7373411655426025, "learning_rate": 0.0002, "loss": 0.2352, "step": 680 }, { "epoch": 1.6174734356552538, "grad_norm": 0.5376583933830261, "learning_rate": 0.0002, "loss": 0.2043, "step": 685 }, { "epoch": 1.629279811097993, "grad_norm": 0.8895164132118225, "learning_rate": 0.0002, "loss": 0.2155, "step": 690 }, { "epoch": 1.641086186540732, "grad_norm": 0.8033216595649719, "learning_rate": 0.0002, "loss": 0.2265, "step": 695 }, { "epoch": 1.6528925619834711, "grad_norm": 0.6350634694099426, "learning_rate": 0.0002, "loss": 0.2144, "step": 700 }, { "epoch": 1.66469893742621, "grad_norm": 0.7593154907226562, "learning_rate": 0.0002, "loss": 0.225, "step": 705 }, { "epoch": 1.676505312868949, "grad_norm": 0.6299831867218018, "learning_rate": 0.0002, "loss": 0.2385, "step": 710 }, { "epoch": 1.6883116883116882, "grad_norm": 0.5315602421760559, "learning_rate": 0.0002, "loss": 0.197, "step": 715 }, { "epoch": 1.7001180637544273, "grad_norm": 0.6873396039009094, "learning_rate": 0.0002, "loss": 0.2164, "step": 720 }, { "epoch": 1.7119244391971664, "grad_norm": 0.8124886155128479, "learning_rate": 0.0002, "loss": 0.2238, "step": 725 }, { "epoch": 1.7237308146399055, "grad_norm": 0.59203040599823, "learning_rate": 0.0002, "loss": 0.2121, "step": 730 }, { "epoch": 1.7355371900826446, "grad_norm": 0.7568244934082031, "learning_rate": 0.0002, "loss": 0.2228, "step": 735 }, { "epoch": 1.7473435655253837, "grad_norm": 0.6371917128562927, "learning_rate": 0.0002, "loss": 0.197, "step": 740 }, { "epoch": 1.7591499409681228, "grad_norm": 0.8084881901741028, "learning_rate": 0.0002, "loss": 0.2304, "step": 745 }, { "epoch": 1.770956316410862, "grad_norm": 0.7568153142929077, "learning_rate": 0.0002, "loss": 0.2138, "step": 750 }, { "epoch": 1.782762691853601, "grad_norm": 0.8049147725105286, "learning_rate": 0.0002, "loss": 0.2327, "step": 755 }, { "epoch": 1.7945690672963401, "grad_norm": 0.6379196047782898, "learning_rate": 0.0002, "loss": 0.2219, "step": 760 }, { "epoch": 1.8063754427390792, "grad_norm": 0.6795427203178406, "learning_rate": 0.0002, "loss": 0.2016, "step": 765 }, { "epoch": 1.8181818181818183, "grad_norm": 0.5892528295516968, "learning_rate": 0.0002, "loss": 0.2013, "step": 770 }, { "epoch": 1.8299881936245572, "grad_norm": 0.7379834651947021, "learning_rate": 0.0002, "loss": 0.2158, "step": 775 }, { "epoch": 1.8417945690672963, "grad_norm": 0.70561283826828, "learning_rate": 0.0002, "loss": 0.2042, "step": 780 }, { "epoch": 1.8536009445100354, "grad_norm": 0.7286373376846313, "learning_rate": 0.0002, "loss": 0.2135, "step": 785 }, { "epoch": 1.8654073199527745, "grad_norm": 0.6097682118415833, "learning_rate": 0.0002, "loss": 0.2237, "step": 790 }, { "epoch": 1.8772136953955134, "grad_norm": 0.7925320863723755, "learning_rate": 0.0002, "loss": 0.2303, "step": 795 }, { "epoch": 1.8890200708382525, "grad_norm": 0.673173725605011, "learning_rate": 0.0002, "loss": 0.2078, "step": 800 }, { "epoch": 1.9008264462809916, "grad_norm": 0.9153968095779419, "learning_rate": 0.0002, "loss": 0.2138, "step": 805 }, { "epoch": 1.9126328217237307, "grad_norm": 0.6706417202949524, "learning_rate": 0.0002, "loss": 0.2215, "step": 810 }, { "epoch": 1.9244391971664698, "grad_norm": 0.5349032878875732, "learning_rate": 0.0002, "loss": 0.1891, "step": 815 }, { "epoch": 1.936245572609209, "grad_norm": 0.6375519037246704, "learning_rate": 0.0002, "loss": 0.2169, "step": 820 }, { "epoch": 1.948051948051948, "grad_norm": 0.7146507501602173, "learning_rate": 0.0002, "loss": 0.1986, "step": 825 }, { "epoch": 1.9598583234946871, "grad_norm": 0.6205456256866455, "learning_rate": 0.0002, "loss": 0.2038, "step": 830 }, { "epoch": 1.9716646989374262, "grad_norm": 0.6656145453453064, "learning_rate": 0.0002, "loss": 0.2308, "step": 835 }, { "epoch": 1.9834710743801653, "grad_norm": 0.8163105249404907, "learning_rate": 0.0002, "loss": 0.1993, "step": 840 }, { "epoch": 1.9952774498229044, "grad_norm": 0.9479507207870483, "learning_rate": 0.0002, "loss": 0.1947, "step": 845 }, { "epoch": 2.0070838252656436, "grad_norm": 0.5905447602272034, "learning_rate": 0.0002, "loss": 0.2108, "step": 850 }, { "epoch": 2.0188902007083827, "grad_norm": 0.6258553266525269, "learning_rate": 0.0002, "loss": 0.1711, "step": 855 }, { "epoch": 2.0306965761511218, "grad_norm": 0.5849653482437134, "learning_rate": 0.0002, "loss": 0.1699, "step": 860 }, { "epoch": 2.042502951593861, "grad_norm": 0.5061454176902771, "learning_rate": 0.0002, "loss": 0.1721, "step": 865 }, { "epoch": 2.0543093270366, "grad_norm": 0.5439514517784119, "learning_rate": 0.0002, "loss": 0.1859, "step": 870 }, { "epoch": 2.0661157024793386, "grad_norm": 0.5932187438011169, "learning_rate": 0.0002, "loss": 0.1566, "step": 875 }, { "epoch": 2.0779220779220777, "grad_norm": 0.6605520844459534, "learning_rate": 0.0002, "loss": 0.1852, "step": 880 }, { "epoch": 2.089728453364817, "grad_norm": 0.5616372227668762, "learning_rate": 0.0002, "loss": 0.1481, "step": 885 }, { "epoch": 2.101534828807556, "grad_norm": 0.6735979914665222, "learning_rate": 0.0002, "loss": 0.172, "step": 890 }, { "epoch": 2.113341204250295, "grad_norm": 0.5019059181213379, "learning_rate": 0.0002, "loss": 0.1603, "step": 895 }, { "epoch": 2.125147579693034, "grad_norm": 0.5685228109359741, "learning_rate": 0.0002, "loss": 0.173, "step": 900 }, { "epoch": 2.1369539551357732, "grad_norm": 0.5904942154884338, "learning_rate": 0.0002, "loss": 0.1827, "step": 905 }, { "epoch": 2.1487603305785123, "grad_norm": 0.9687920808792114, "learning_rate": 0.0002, "loss": 0.1636, "step": 910 }, { "epoch": 2.1605667060212514, "grad_norm": 0.5926781892776489, "learning_rate": 0.0002, "loss": 0.178, "step": 915 }, { "epoch": 2.1723730814639906, "grad_norm": 0.7833774089813232, "learning_rate": 0.0002, "loss": 0.1884, "step": 920 }, { "epoch": 2.1841794569067297, "grad_norm": 0.5398828387260437, "learning_rate": 0.0002, "loss": 0.1524, "step": 925 }, { "epoch": 2.1959858323494688, "grad_norm": 0.7538560628890991, "learning_rate": 0.0002, "loss": 0.196, "step": 930 }, { "epoch": 2.207792207792208, "grad_norm": 0.5702058672904968, "learning_rate": 0.0002, "loss": 0.1895, "step": 935 }, { "epoch": 2.219598583234947, "grad_norm": 0.640801727771759, "learning_rate": 0.0002, "loss": 0.1766, "step": 940 }, { "epoch": 2.231404958677686, "grad_norm": 0.5538972020149231, "learning_rate": 0.0002, "loss": 0.1821, "step": 945 }, { "epoch": 2.243211334120425, "grad_norm": 0.49248647689819336, "learning_rate": 0.0002, "loss": 0.169, "step": 950 }, { "epoch": 2.2550177095631643, "grad_norm": 0.6323942542076111, "learning_rate": 0.0002, "loss": 0.1581, "step": 955 }, { "epoch": 2.2668240850059034, "grad_norm": 0.5410608053207397, "learning_rate": 0.0002, "loss": 0.1459, "step": 960 }, { "epoch": 2.2786304604486425, "grad_norm": 0.63059002161026, "learning_rate": 0.0002, "loss": 0.1448, "step": 965 }, { "epoch": 2.2904368358913816, "grad_norm": 0.5819523334503174, "learning_rate": 0.0002, "loss": 0.174, "step": 970 }, { "epoch": 2.3022432113341202, "grad_norm": 0.626478374004364, "learning_rate": 0.0002, "loss": 0.1719, "step": 975 }, { "epoch": 2.3140495867768593, "grad_norm": 0.5042787194252014, "learning_rate": 0.0002, "loss": 0.1698, "step": 980 }, { "epoch": 2.3258559622195984, "grad_norm": 0.5394904017448425, "learning_rate": 0.0002, "loss": 0.1642, "step": 985 }, { "epoch": 2.3376623376623376, "grad_norm": 0.521476149559021, "learning_rate": 0.0002, "loss": 0.1768, "step": 990 }, { "epoch": 2.3494687131050767, "grad_norm": 0.5270056128501892, "learning_rate": 0.0002, "loss": 0.1656, "step": 995 }, { "epoch": 2.3612750885478158, "grad_norm": 0.7696222066879272, "learning_rate": 0.0002, "loss": 0.1809, "step": 1000 }, { "epoch": 2.373081463990555, "grad_norm": 0.69610196352005, "learning_rate": 0.0002, "loss": 0.1785, "step": 1005 }, { "epoch": 2.384887839433294, "grad_norm": 0.6440519690513611, "learning_rate": 0.0002, "loss": 0.1755, "step": 1010 }, { "epoch": 2.396694214876033, "grad_norm": 0.5550039410591125, "learning_rate": 0.0002, "loss": 0.16, "step": 1015 }, { "epoch": 2.408500590318772, "grad_norm": 0.6378772854804993, "learning_rate": 0.0002, "loss": 0.1665, "step": 1020 }, { "epoch": 2.4203069657615113, "grad_norm": 0.6123429536819458, "learning_rate": 0.0002, "loss": 0.157, "step": 1025 }, { "epoch": 2.4321133412042504, "grad_norm": 0.589279055595398, "learning_rate": 0.0002, "loss": 0.1552, "step": 1030 }, { "epoch": 2.4439197166469895, "grad_norm": 0.5348511934280396, "learning_rate": 0.0002, "loss": 0.1608, "step": 1035 }, { "epoch": 2.4557260920897286, "grad_norm": 0.7544596195220947, "learning_rate": 0.0002, "loss": 0.1667, "step": 1040 }, { "epoch": 2.4675324675324677, "grad_norm": 0.5579289793968201, "learning_rate": 0.0002, "loss": 0.1692, "step": 1045 }, { "epoch": 2.479338842975207, "grad_norm": 0.5934975743293762, "learning_rate": 0.0002, "loss": 0.1676, "step": 1050 }, { "epoch": 2.4911452184179455, "grad_norm": 0.5102668404579163, "learning_rate": 0.0002, "loss": 0.1419, "step": 1055 }, { "epoch": 2.5029515938606846, "grad_norm": 0.6462379693984985, "learning_rate": 0.0002, "loss": 0.1904, "step": 1060 }, { "epoch": 2.5147579693034237, "grad_norm": 0.6032257080078125, "learning_rate": 0.0002, "loss": 0.142, "step": 1065 }, { "epoch": 2.5265643447461628, "grad_norm": 0.7131995558738708, "learning_rate": 0.0002, "loss": 0.1806, "step": 1070 }, { "epoch": 2.538370720188902, "grad_norm": 0.5559495091438293, "learning_rate": 0.0002, "loss": 0.1646, "step": 1075 }, { "epoch": 2.550177095631641, "grad_norm": 0.5779125690460205, "learning_rate": 0.0002, "loss": 0.1555, "step": 1080 }, { "epoch": 2.56198347107438, "grad_norm": 0.5030808448791504, "learning_rate": 0.0002, "loss": 0.1345, "step": 1085 }, { "epoch": 2.573789846517119, "grad_norm": 0.6528759598731995, "learning_rate": 0.0002, "loss": 0.1656, "step": 1090 }, { "epoch": 2.5855962219598583, "grad_norm": 0.5437236428260803, "learning_rate": 0.0002, "loss": 0.1385, "step": 1095 }, { "epoch": 2.5974025974025974, "grad_norm": 0.5143546462059021, "learning_rate": 0.0002, "loss": 0.1438, "step": 1100 }, { "epoch": 2.6092089728453365, "grad_norm": 0.6868805885314941, "learning_rate": 0.0002, "loss": 0.1557, "step": 1105 }, { "epoch": 2.6210153482880756, "grad_norm": 0.6440519690513611, "learning_rate": 0.0002, "loss": 0.1493, "step": 1110 }, { "epoch": 2.6328217237308147, "grad_norm": 0.5339066386222839, "learning_rate": 0.0002, "loss": 0.1615, "step": 1115 }, { "epoch": 2.644628099173554, "grad_norm": 0.5612460970878601, "learning_rate": 0.0002, "loss": 0.1588, "step": 1120 }, { "epoch": 2.656434474616293, "grad_norm": 0.738273024559021, "learning_rate": 0.0002, "loss": 0.1751, "step": 1125 }, { "epoch": 2.668240850059032, "grad_norm": 0.5570853352546692, "learning_rate": 0.0002, "loss": 0.1449, "step": 1130 }, { "epoch": 2.680047225501771, "grad_norm": 0.646135151386261, "learning_rate": 0.0002, "loss": 0.1802, "step": 1135 }, { "epoch": 2.69185360094451, "grad_norm": 0.5823525190353394, "learning_rate": 0.0002, "loss": 0.1689, "step": 1140 }, { "epoch": 2.7036599763872493, "grad_norm": 0.5457978248596191, "learning_rate": 0.0002, "loss": 0.1542, "step": 1145 }, { "epoch": 2.7154663518299884, "grad_norm": 0.6531795859336853, "learning_rate": 0.0002, "loss": 0.1572, "step": 1150 }, { "epoch": 2.7272727272727275, "grad_norm": 0.494403600692749, "learning_rate": 0.0002, "loss": 0.1636, "step": 1155 }, { "epoch": 2.7390791027154666, "grad_norm": 0.5932457447052002, "learning_rate": 0.0002, "loss": 0.1689, "step": 1160 }, { "epoch": 2.7508854781582053, "grad_norm": 0.6143500208854675, "learning_rate": 0.0002, "loss": 0.1794, "step": 1165 }, { "epoch": 2.7626918536009444, "grad_norm": 0.673255205154419, "learning_rate": 0.0002, "loss": 0.1659, "step": 1170 }, { "epoch": 2.7744982290436835, "grad_norm": 0.6782878637313843, "learning_rate": 0.0002, "loss": 0.1794, "step": 1175 }, { "epoch": 2.7863046044864226, "grad_norm": 0.5923856496810913, "learning_rate": 0.0002, "loss": 0.1751, "step": 1180 }, { "epoch": 2.7981109799291617, "grad_norm": 0.6033382415771484, "learning_rate": 0.0002, "loss": 0.1601, "step": 1185 }, { "epoch": 2.809917355371901, "grad_norm": 0.6074445843696594, "learning_rate": 0.0002, "loss": 0.1507, "step": 1190 }, { "epoch": 2.82172373081464, "grad_norm": 0.6589583158493042, "learning_rate": 0.0002, "loss": 0.1598, "step": 1195 }, { "epoch": 2.833530106257379, "grad_norm": 0.6717161536216736, "learning_rate": 0.0002, "loss": 0.1655, "step": 1200 }, { "epoch": 2.845336481700118, "grad_norm": 0.6025908589363098, "learning_rate": 0.0002, "loss": 0.1509, "step": 1205 }, { "epoch": 2.857142857142857, "grad_norm": 0.4749529957771301, "learning_rate": 0.0002, "loss": 0.1436, "step": 1210 }, { "epoch": 2.8689492325855963, "grad_norm": 0.49929025769233704, "learning_rate": 0.0002, "loss": 0.1476, "step": 1215 }, { "epoch": 2.8807556080283354, "grad_norm": 0.567028820514679, "learning_rate": 0.0002, "loss": 0.1651, "step": 1220 }, { "epoch": 2.8925619834710745, "grad_norm": 0.5941957235336304, "learning_rate": 0.0002, "loss": 0.1534, "step": 1225 }, { "epoch": 2.9043683589138136, "grad_norm": 0.6283835172653198, "learning_rate": 0.0002, "loss": 0.1719, "step": 1230 }, { "epoch": 2.9161747343565523, "grad_norm": 0.6478221416473389, "learning_rate": 0.0002, "loss": 0.1799, "step": 1235 }, { "epoch": 2.9279811097992914, "grad_norm": 0.4803556799888611, "learning_rate": 0.0002, "loss": 0.1716, "step": 1240 }, { "epoch": 2.9397874852420305, "grad_norm": 0.6615630984306335, "learning_rate": 0.0002, "loss": 0.1558, "step": 1245 }, { "epoch": 2.9515938606847696, "grad_norm": 0.5517375469207764, "learning_rate": 0.0002, "loss": 0.1912, "step": 1250 }, { "epoch": 2.9634002361275087, "grad_norm": 0.5428590178489685, "learning_rate": 0.0002, "loss": 0.1503, "step": 1255 }, { "epoch": 2.975206611570248, "grad_norm": 0.5363365411758423, "learning_rate": 0.0002, "loss": 0.1604, "step": 1260 }, { "epoch": 2.987012987012987, "grad_norm": 0.6846977472305298, "learning_rate": 0.0002, "loss": 0.1692, "step": 1265 } ], "logging_steps": 5, "max_steps": 1269, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.739287825861544e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }