{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.841918294849023, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011841326228537596, "grad_norm": 0.6710847020149231, "learning_rate": 4.9998075682257415e-05, "loss": 3.876, "step": 5 }, { "epoch": 0.023682652457075192, "grad_norm": 0.8348745107650757, "learning_rate": 4.9992303025269555e-05, "loss": 3.8538, "step": 10 }, { "epoch": 0.035523978685612786, "grad_norm": 2.387429714202881, "learning_rate": 4.9982682917710524e-05, "loss": 3.5742, "step": 15 }, { "epoch": 0.047365304914150384, "grad_norm": 1.0946344137191772, "learning_rate": 4.996921684055182e-05, "loss": 3.5193, "step": 20 }, { "epoch": 0.05920663114268798, "grad_norm": 1.1773927211761475, "learning_rate": 4.9951906866834316e-05, "loss": 2.9465, "step": 25 }, { "epoch": 0.07104795737122557, "grad_norm": 1.147207260131836, "learning_rate": 4.993075566134921e-05, "loss": 3.0208, "step": 30 }, { "epoch": 0.08288928359976318, "grad_norm": 1.8781776428222656, "learning_rate": 4.990576648022768e-05, "loss": 2.7173, "step": 35 }, { "epoch": 0.09473060982830077, "grad_norm": 1.009155511856079, "learning_rate": 4.987694317043969e-05, "loss": 2.6235, "step": 40 }, { "epoch": 0.10657193605683836, "grad_norm": 0.9919832944869995, "learning_rate": 4.984429016920178e-05, "loss": 2.4021, "step": 45 }, { "epoch": 0.11841326228537596, "grad_norm": 0.8563632965087891, "learning_rate": 4.980781250329389e-05, "loss": 2.1688, "step": 50 }, { "epoch": 0.13025458851391356, "grad_norm": 0.7374653816223145, "learning_rate": 4.976751578828562e-05, "loss": 2.2943, "step": 55 }, { "epoch": 0.14209591474245115, "grad_norm": 0.9438661336898804, "learning_rate": 4.9723406227671643e-05, "loss": 1.9571, "step": 60 }, { "epoch": 0.15393724097098876, "grad_norm": 0.9622551202774048, "learning_rate": 4.967549061191679e-05, "loss": 2.1024, "step": 65 }, { "epoch": 0.16577856719952636, "grad_norm": 0.6480829119682312, "learning_rate": 4.96237763174106e-05, "loss": 1.9145, "step": 70 }, { "epoch": 0.17761989342806395, "grad_norm": 1.2615852355957031, "learning_rate": 4.956827130533185e-05, "loss": 2.1472, "step": 75 }, { "epoch": 0.18946121965660154, "grad_norm": 1.0979194641113281, "learning_rate": 4.95089841204229e-05, "loss": 2.0441, "step": 80 }, { "epoch": 0.20130254588513913, "grad_norm": 0.8610992431640625, "learning_rate": 4.944592388967428e-05, "loss": 2.1605, "step": 85 }, { "epoch": 0.21314387211367672, "grad_norm": 1.0579572916030884, "learning_rate": 4.937910032091968e-05, "loss": 2.2801, "step": 90 }, { "epoch": 0.22498519834221434, "grad_norm": 1.0034611225128174, "learning_rate": 4.930852370134141e-05, "loss": 2.2327, "step": 95 }, { "epoch": 0.23682652457075193, "grad_norm": 0.8670480847358704, "learning_rate": 4.923420489588677e-05, "loss": 2.0031, "step": 100 }, { "epoch": 0.24866785079928952, "grad_norm": 0.8441548347473145, "learning_rate": 4.9156155345595445e-05, "loss": 1.8637, "step": 105 }, { "epoch": 0.2605091770278271, "grad_norm": 0.8399316072463989, "learning_rate": 4.907438706583818e-05, "loss": 1.9813, "step": 110 }, { "epoch": 0.27235050325636473, "grad_norm": 0.9002792835235596, "learning_rate": 4.898891264446709e-05, "loss": 2.1698, "step": 115 }, { "epoch": 0.2841918294849023, "grad_norm": 1.1735421419143677, "learning_rate": 4.8899745239877845e-05, "loss": 2.1691, "step": 120 }, { "epoch": 0.2960331557134399, "grad_norm": 1.2177324295043945, "learning_rate": 4.880689857898392e-05, "loss": 2.1437, "step": 125 }, { "epoch": 0.30787448194197753, "grad_norm": 1.062441349029541, "learning_rate": 4.871038695510347e-05, "loss": 2.078, "step": 130 }, { "epoch": 0.3197158081705151, "grad_norm": 0.8719960451126099, "learning_rate": 4.861022522575892e-05, "loss": 2.0338, "step": 135 }, { "epoch": 0.3315571343990527, "grad_norm": 1.042143702507019, "learning_rate": 4.8506428810389696e-05, "loss": 2.1387, "step": 140 }, { "epoch": 0.3433984606275903, "grad_norm": 0.8554458022117615, "learning_rate": 4.839901368797849e-05, "loss": 2.0853, "step": 145 }, { "epoch": 0.3552397868561279, "grad_norm": 1.033553957939148, "learning_rate": 4.828799639459138e-05, "loss": 1.9919, "step": 150 }, { "epoch": 0.36708111308466546, "grad_norm": 0.7688170075416565, "learning_rate": 4.8173394020832164e-05, "loss": 2.1078, "step": 155 }, { "epoch": 0.3789224393132031, "grad_norm": 0.9199577569961548, "learning_rate": 4.8055224209211316e-05, "loss": 1.9385, "step": 160 }, { "epoch": 0.3907637655417407, "grad_norm": 1.0168030261993408, "learning_rate": 4.793350515143007e-05, "loss": 1.8268, "step": 165 }, { "epoch": 0.40260509177027826, "grad_norm": 1.2735998630523682, "learning_rate": 4.780825558557981e-05, "loss": 1.8595, "step": 170 }, { "epoch": 0.4144464179988159, "grad_norm": 0.7815925478935242, "learning_rate": 4.767949479325748e-05, "loss": 2.005, "step": 175 }, { "epoch": 0.42628774422735344, "grad_norm": 0.9479308724403381, "learning_rate": 4.7547242596597274e-05, "loss": 2.0088, "step": 180 }, { "epoch": 0.43812907045589106, "grad_norm": 1.5356885194778442, "learning_rate": 4.7411519355219066e-05, "loss": 2.1591, "step": 185 }, { "epoch": 0.4499703966844287, "grad_norm": 1.3683360815048218, "learning_rate": 4.727234596309417e-05, "loss": 1.9772, "step": 190 }, { "epoch": 0.46181172291296624, "grad_norm": 1.2764394283294678, "learning_rate": 4.71297438453288e-05, "loss": 1.9476, "step": 195 }, { "epoch": 0.47365304914150386, "grad_norm": 1.4485411643981934, "learning_rate": 4.698373495486579e-05, "loss": 2.0495, "step": 200 }, { "epoch": 0.4854943753700414, "grad_norm": 0.9207140207290649, "learning_rate": 4.683434176910503e-05, "loss": 2.0267, "step": 205 }, { "epoch": 0.49733570159857904, "grad_norm": 1.0169957876205444, "learning_rate": 4.6681587286443146e-05, "loss": 1.8998, "step": 210 }, { "epoch": 0.5091770278271166, "grad_norm": 1.0399326086044312, "learning_rate": 4.652549502273304e-05, "loss": 1.9095, "step": 215 }, { "epoch": 0.5210183540556542, "grad_norm": 1.6732031106948853, "learning_rate": 4.636608900766372e-05, "loss": 1.935, "step": 220 }, { "epoch": 0.5328596802841918, "grad_norm": 1.1295055150985718, "learning_rate": 4.620339378106102e-05, "loss": 2.084, "step": 225 }, { "epoch": 0.5447010065127295, "grad_norm": 1.196055293083191, "learning_rate": 4.603743438910986e-05, "loss": 1.9387, "step": 230 }, { "epoch": 0.5565423327412671, "grad_norm": 1.519239902496338, "learning_rate": 4.586823638049841e-05, "loss": 1.871, "step": 235 }, { "epoch": 0.5683836589698046, "grad_norm": 1.1272022724151611, "learning_rate": 4.5695825802485085e-05, "loss": 1.9629, "step": 240 }, { "epoch": 0.5802249851983422, "grad_norm": 1.135977864265442, "learning_rate": 4.552022919688861e-05, "loss": 2.1182, "step": 245 }, { "epoch": 0.5920663114268798, "grad_norm": 1.3236933946609497, "learning_rate": 4.53414735960021e-05, "loss": 1.8874, "step": 250 }, { "epoch": 0.6039076376554174, "grad_norm": 2.230280637741089, "learning_rate": 4.51595865184315e-05, "loss": 1.8281, "step": 255 }, { "epoch": 0.6157489638839551, "grad_norm": 1.3642871379852295, "learning_rate": 4.497459596485924e-05, "loss": 2.0387, "step": 260 }, { "epoch": 0.6275902901124926, "grad_norm": 0.9770427942276001, "learning_rate": 4.47865304137337e-05, "loss": 2.1307, "step": 265 }, { "epoch": 0.6394316163410302, "grad_norm": 3.981346845626831, "learning_rate": 4.4595418816885004e-05, "loss": 2.0755, "step": 270 }, { "epoch": 0.6512729425695678, "grad_norm": 1.1023027896881104, "learning_rate": 4.440129059506808e-05, "loss": 1.9149, "step": 275 }, { "epoch": 0.6631142687981054, "grad_norm": 1.648698091506958, "learning_rate": 4.420417563343346e-05, "loss": 1.984, "step": 280 }, { "epoch": 0.6749555950266429, "grad_norm": 1.157861590385437, "learning_rate": 4.40041042769266e-05, "loss": 1.8697, "step": 285 }, { "epoch": 0.6867969212551805, "grad_norm": 1.0009181499481201, "learning_rate": 4.380110732561637e-05, "loss": 1.7951, "step": 290 }, { "epoch": 0.6986382474837182, "grad_norm": 2.11307692527771, "learning_rate": 4.3595216029953575e-05, "loss": 1.6276, "step": 295 }, { "epoch": 0.7104795737122558, "grad_norm": 1.0800515413284302, "learning_rate": 4.3386462085960086e-05, "loss": 1.8438, "step": 300 }, { "epoch": 0.7223208999407934, "grad_norm": 0.9078517556190491, "learning_rate": 4.3174877630349366e-05, "loss": 1.7839, "step": 305 }, { "epoch": 0.7341622261693309, "grad_norm": 1.0842117071151733, "learning_rate": 4.296049523557917e-05, "loss": 1.8604, "step": 310 }, { "epoch": 0.7460035523978685, "grad_norm": 1.2695246934890747, "learning_rate": 4.2743347904837176e-05, "loss": 1.9514, "step": 315 }, { "epoch": 0.7578448786264061, "grad_norm": 1.0501705408096313, "learning_rate": 4.2523469066960295e-05, "loss": 1.9439, "step": 320 }, { "epoch": 0.7696862048549438, "grad_norm": 1.3608386516571045, "learning_rate": 4.230089257128842e-05, "loss": 2.0266, "step": 325 }, { "epoch": 0.7815275310834814, "grad_norm": 1.1087582111358643, "learning_rate": 4.2075652682453554e-05, "loss": 1.9851, "step": 330 }, { "epoch": 0.7933688573120189, "grad_norm": 1.4247711896896362, "learning_rate": 4.184778407510484e-05, "loss": 1.976, "step": 335 }, { "epoch": 0.8052101835405565, "grad_norm": 1.5246973037719727, "learning_rate": 4.16173218285706e-05, "loss": 1.9039, "step": 340 }, { "epoch": 0.8170515097690941, "grad_norm": 0.8650221824645996, "learning_rate": 4.138430142145805e-05, "loss": 1.9577, "step": 345 }, { "epoch": 0.8288928359976317, "grad_norm": 1.5152937173843384, "learning_rate": 4.114875872619147e-05, "loss": 1.9182, "step": 350 }, { "epoch": 0.8407341622261694, "grad_norm": 1.2927278280258179, "learning_rate": 4.0910730003489894e-05, "loss": 1.9574, "step": 355 }, { "epoch": 0.8525754884547069, "grad_norm": 1.193464756011963, "learning_rate": 4.067025189678485e-05, "loss": 1.9129, "step": 360 }, { "epoch": 0.8644168146832445, "grad_norm": 1.1596301794052124, "learning_rate": 4.042736142657935e-05, "loss": 2.0108, "step": 365 }, { "epoch": 0.8762581409117821, "grad_norm": 1.278910756111145, "learning_rate": 4.018209598474869e-05, "loss": 2.1509, "step": 370 }, { "epoch": 0.8880994671403197, "grad_norm": 0.9644595384597778, "learning_rate": 3.993449332878418e-05, "loss": 2.01, "step": 375 }, { "epoch": 0.8999407933688574, "grad_norm": 1.2030296325683594, "learning_rate": 3.9684591575980546e-05, "loss": 2.0312, "step": 380 }, { "epoch": 0.9117821195973949, "grad_norm": 1.4285073280334473, "learning_rate": 3.943242919756792e-05, "loss": 1.9705, "step": 385 }, { "epoch": 0.9236234458259325, "grad_norm": 1.1423532962799072, "learning_rate": 3.917804501278942e-05, "loss": 2.1293, "step": 390 }, { "epoch": 0.9354647720544701, "grad_norm": 1.3577874898910522, "learning_rate": 3.8921478182925055e-05, "loss": 2.1897, "step": 395 }, { "epoch": 0.9473060982830077, "grad_norm": 1.166759967803955, "learning_rate": 3.8662768205263044e-05, "loss": 1.966, "step": 400 }, { "epoch": 0.9591474245115453, "grad_norm": 0.8779304623603821, "learning_rate": 3.8401954907019424e-05, "loss": 1.898, "step": 405 }, { "epoch": 0.9709887507400828, "grad_norm": 0.9446001648902893, "learning_rate": 3.813907843920675e-05, "loss": 2.1288, "step": 410 }, { "epoch": 0.9828300769686205, "grad_norm": 1.2340161800384521, "learning_rate": 3.787417927045315e-05, "loss": 1.8449, "step": 415 }, { "epoch": 0.9946714031971581, "grad_norm": 1.0638560056686401, "learning_rate": 3.7607298180772236e-05, "loss": 1.8785, "step": 420 }, { "epoch": 1.0065127294256957, "grad_norm": 1.046241044998169, "learning_rate": 3.733847625528529e-05, "loss": 1.801, "step": 425 }, { "epoch": 1.0183540556542332, "grad_norm": 0.9987891316413879, "learning_rate": 3.706775487789639e-05, "loss": 1.9475, "step": 430 }, { "epoch": 1.030195381882771, "grad_norm": 1.4792598485946655, "learning_rate": 3.679517572492151e-05, "loss": 1.9996, "step": 435 }, { "epoch": 1.0420367081113084, "grad_norm": 1.3502384424209595, "learning_rate": 3.652078075867267e-05, "loss": 1.9598, "step": 440 }, { "epoch": 1.0538780343398462, "grad_norm": 0.9646159410476685, "learning_rate": 3.624461222099804e-05, "loss": 1.9598, "step": 445 }, { "epoch": 1.0657193605683837, "grad_norm": 1.464591145515442, "learning_rate": 3.596671262677898e-05, "loss": 1.9089, "step": 450 }, { "epoch": 1.0775606867969212, "grad_norm": 0.9556955099105835, "learning_rate": 3.568712475738508e-05, "loss": 1.9496, "step": 455 }, { "epoch": 1.089402013025459, "grad_norm": 1.2917728424072266, "learning_rate": 3.5405891654088154e-05, "loss": 1.9467, "step": 460 }, { "epoch": 1.1012433392539964, "grad_norm": 1.3475714921951294, "learning_rate": 3.5123056611436224e-05, "loss": 2.0235, "step": 465 }, { "epoch": 1.1130846654825342, "grad_norm": 0.9648019671440125, "learning_rate": 3.483866317058857e-05, "loss": 1.8844, "step": 470 }, { "epoch": 1.1249259917110717, "grad_norm": 1.3338149785995483, "learning_rate": 3.4552755112612714e-05, "loss": 1.8287, "step": 475 }, { "epoch": 1.1367673179396092, "grad_norm": 1.1770730018615723, "learning_rate": 3.4265376451744565e-05, "loss": 2.0754, "step": 480 }, { "epoch": 1.148608644168147, "grad_norm": 1.0296565294265747, "learning_rate": 3.397657142861258e-05, "loss": 1.9269, "step": 485 }, { "epoch": 1.1604499703966844, "grad_norm": 1.1220016479492188, "learning_rate": 3.3686384503427174e-05, "loss": 2.0613, "step": 490 }, { "epoch": 1.1722912966252221, "grad_norm": 1.0247845649719238, "learning_rate": 3.339486034913627e-05, "loss": 1.8047, "step": 495 }, { "epoch": 1.1841326228537596, "grad_norm": 0.9551325440406799, "learning_rate": 3.3102043844548044e-05, "loss": 1.8639, "step": 500 }, { "epoch": 1.1959739490822971, "grad_norm": 1.7456352710723877, "learning_rate": 3.280798006742213e-05, "loss": 1.9545, "step": 505 }, { "epoch": 1.2078152753108349, "grad_norm": 1.260713815689087, "learning_rate": 3.2512714287530006e-05, "loss": 1.874, "step": 510 }, { "epoch": 1.2196566015393724, "grad_norm": 1.4492847919464111, "learning_rate": 3.2216291959686006e-05, "loss": 1.9178, "step": 515 }, { "epoch": 1.2314979277679101, "grad_norm": 1.4412329196929932, "learning_rate": 3.191875871674971e-05, "loss": 1.919, "step": 520 }, { "epoch": 1.2433392539964476, "grad_norm": 1.2013176679611206, "learning_rate": 3.1620160362600984e-05, "loss": 1.9128, "step": 525 }, { "epoch": 1.2551805802249851, "grad_norm": 1.0584838390350342, "learning_rate": 3.1320542865088696e-05, "loss": 2.0132, "step": 530 }, { "epoch": 1.2670219064535229, "grad_norm": 0.9713219404220581, "learning_rate": 3.101995234895416e-05, "loss": 1.9014, "step": 535 }, { "epoch": 1.2788632326820604, "grad_norm": 1.167724609375, "learning_rate": 3.071843508873046e-05, "loss": 1.8482, "step": 540 }, { "epoch": 1.290704558910598, "grad_norm": 1.43012273311615, "learning_rate": 3.0416037501618677e-05, "loss": 1.7475, "step": 545 }, { "epoch": 1.3025458851391356, "grad_norm": 1.8622283935546875, "learning_rate": 3.0112806140342176e-05, "loss": 2.0185, "step": 550 }, { "epoch": 1.3143872113676731, "grad_norm": 1.513700008392334, "learning_rate": 2.9808787685980054e-05, "loss": 1.8825, "step": 555 }, { "epoch": 1.3262285375962108, "grad_norm": 1.4935449361801147, "learning_rate": 2.9504028940780776e-05, "loss": 1.8319, "step": 560 }, { "epoch": 1.3380698638247484, "grad_norm": 1.4678127765655518, "learning_rate": 2.9198576820957187e-05, "loss": 1.9613, "step": 565 }, { "epoch": 1.349911190053286, "grad_norm": 1.2753045558929443, "learning_rate": 2.8892478349463986e-05, "loss": 1.9367, "step": 570 }, { "epoch": 1.3617525162818236, "grad_norm": 1.5118064880371094, "learning_rate": 2.858578064875874e-05, "loss": 1.895, "step": 575 }, { "epoch": 1.373593842510361, "grad_norm": 1.3416668176651, "learning_rate": 2.8278530933547624e-05, "loss": 2.0023, "step": 580 }, { "epoch": 1.3854351687388988, "grad_norm": 1.6278949975967407, "learning_rate": 2.79707765035169e-05, "loss": 1.9469, "step": 585 }, { "epoch": 1.3972764949674363, "grad_norm": 1.2708017826080322, "learning_rate": 2.7662564736051377e-05, "loss": 1.7915, "step": 590 }, { "epoch": 1.409117821195974, "grad_norm": 1.0602233409881592, "learning_rate": 2.7353943078940875e-05, "loss": 1.8253, "step": 595 }, { "epoch": 1.4209591474245116, "grad_norm": 1.3578535318374634, "learning_rate": 2.7044959043075814e-05, "loss": 2.0741, "step": 600 }, { "epoch": 1.432800473653049, "grad_norm": 1.1489042043685913, "learning_rate": 2.67356601951332e-05, "loss": 1.901, "step": 605 }, { "epoch": 1.4446417998815868, "grad_norm": 1.680655598640442, "learning_rate": 2.64260941502539e-05, "loss": 2.0099, "step": 610 }, { "epoch": 1.4564831261101243, "grad_norm": 1.7020906209945679, "learning_rate": 2.611630856471252e-05, "loss": 1.8853, "step": 615 }, { "epoch": 1.468324452338662, "grad_norm": 0.9535301923751831, "learning_rate": 2.5806351128580964e-05, "loss": 1.8205, "step": 620 }, { "epoch": 1.4801657785671996, "grad_norm": 1.0446418523788452, "learning_rate": 2.5496269558386725e-05, "loss": 2.0851, "step": 625 }, { "epoch": 1.492007104795737, "grad_norm": 1.1212449073791504, "learning_rate": 2.5186111589767187e-05, "loss": 2.0913, "step": 630 }, { "epoch": 1.5038484310242746, "grad_norm": 1.5539659261703491, "learning_rate": 2.487592497012089e-05, "loss": 1.9521, "step": 635 }, { "epoch": 1.5156897572528123, "grad_norm": 1.3954375982284546, "learning_rate": 2.4565757451257128e-05, "loss": 1.8525, "step": 640 }, { "epoch": 1.52753108348135, "grad_norm": 1.5919753313064575, "learning_rate": 2.4255656782044644e-05, "loss": 1.8034, "step": 645 }, { "epoch": 1.5393724097098875, "grad_norm": 1.420255184173584, "learning_rate": 2.3945670701061033e-05, "loss": 2.053, "step": 650 }, { "epoch": 1.551213735938425, "grad_norm": 1.0267295837402344, "learning_rate": 2.3635846929243537e-05, "loss": 1.7877, "step": 655 }, { "epoch": 1.5630550621669625, "grad_norm": 1.778398036956787, "learning_rate": 2.3326233162542655e-05, "loss": 1.9865, "step": 660 }, { "epoch": 1.5748963883955003, "grad_norm": 1.0454083681106567, "learning_rate": 2.3016877064579564e-05, "loss": 1.8799, "step": 665 }, { "epoch": 1.586737714624038, "grad_norm": 1.4005528688430786, "learning_rate": 2.2707826259308492e-05, "loss": 1.9329, "step": 670 }, { "epoch": 1.5985790408525755, "grad_norm": 1.240485429763794, "learning_rate": 2.2399128323685286e-05, "loss": 1.7828, "step": 675 }, { "epoch": 1.610420367081113, "grad_norm": 1.2078522443771362, "learning_rate": 2.2090830780343113e-05, "loss": 2.0114, "step": 680 }, { "epoch": 1.6222616933096505, "grad_norm": 1.1760393381118774, "learning_rate": 2.1782981090276585e-05, "loss": 1.8671, "step": 685 }, { "epoch": 1.6341030195381883, "grad_norm": 1.362109661102295, "learning_rate": 2.147562664553537e-05, "loss": 1.9453, "step": 690 }, { "epoch": 1.645944345766726, "grad_norm": 2.057487964630127, "learning_rate": 2.1168814761928336e-05, "loss": 1.7151, "step": 695 }, { "epoch": 1.6577856719952635, "grad_norm": 1.5289130210876465, "learning_rate": 2.0862592671739608e-05, "loss": 1.9591, "step": 700 }, { "epoch": 1.669626998223801, "grad_norm": 1.42037832736969, "learning_rate": 2.0557007516457288e-05, "loss": 1.8999, "step": 705 }, { "epoch": 1.6814683244523385, "grad_norm": 1.101355791091919, "learning_rate": 2.0252106339516272e-05, "loss": 2.0037, "step": 710 }, { "epoch": 1.6933096506808762, "grad_norm": 5.125020980834961, "learning_rate": 1.9947936079056117e-05, "loss": 1.7028, "step": 715 }, { "epoch": 1.705150976909414, "grad_norm": 1.3062840700149536, "learning_rate": 1.964454356069514e-05, "loss": 2.1283, "step": 720 }, { "epoch": 1.7169923031379515, "grad_norm": 1.273743987083435, "learning_rate": 1.9341975490321827e-05, "loss": 2.0017, "step": 725 }, { "epoch": 1.728833629366489, "grad_norm": 1.3251115083694458, "learning_rate": 1.9040278446904677e-05, "loss": 1.7329, "step": 730 }, { "epoch": 1.7406749555950265, "grad_norm": 1.5570122003555298, "learning_rate": 1.873949887532156e-05, "loss": 1.9965, "step": 735 }, { "epoch": 1.7525162818235642, "grad_norm": 1.3525424003601074, "learning_rate": 1.8439683079209787e-05, "loss": 1.9321, "step": 740 }, { "epoch": 1.764357608052102, "grad_norm": 1.776936411857605, "learning_rate": 1.8140877213837823e-05, "loss": 1.9739, "step": 745 }, { "epoch": 1.7761989342806395, "grad_norm": 1.317015290260315, "learning_rate": 1.7843127278999943e-05, "loss": 1.8864, "step": 750 }, { "epoch": 1.788040260509177, "grad_norm": 1.6961033344268799, "learning_rate": 1.754647911193473e-05, "loss": 1.9227, "step": 755 }, { "epoch": 1.7998815867377145, "grad_norm": 1.448065161705017, "learning_rate": 1.7250978380268694e-05, "loss": 2.0025, "step": 760 }, { "epoch": 1.8117229129662522, "grad_norm": 1.4134864807128906, "learning_rate": 1.6956670574985908e-05, "loss": 1.8039, "step": 765 }, { "epoch": 1.82356423919479, "grad_norm": 1.3354402780532837, "learning_rate": 1.6663601003424883e-05, "loss": 1.8008, "step": 770 }, { "epoch": 1.8354055654233274, "grad_norm": 1.2575969696044922, "learning_rate": 1.6371814782303722e-05, "loss": 1.9609, "step": 775 }, { "epoch": 1.847246891651865, "grad_norm": 1.4361883401870728, "learning_rate": 1.6081356830774625e-05, "loss": 2.0843, "step": 780 }, { "epoch": 1.8590882178804025, "grad_norm": 1.320430874824524, "learning_rate": 1.579227186350875e-05, "loss": 1.8281, "step": 785 }, { "epoch": 1.8709295441089402, "grad_norm": 1.2178723812103271, "learning_rate": 1.5504604383812646e-05, "loss": 1.9628, "step": 790 }, { "epoch": 1.882770870337478, "grad_norm": 1.3220632076263428, "learning_rate": 1.5218398676777102e-05, "loss": 1.9867, "step": 795 }, { "epoch": 1.8946121965660154, "grad_norm": 0.9402835965156555, "learning_rate": 1.4933698802459731e-05, "loss": 1.8819, "step": 800 }, { "epoch": 1.906453522794553, "grad_norm": 1.3736234903335571, "learning_rate": 1.4650548589102092e-05, "loss": 1.8938, "step": 805 }, { "epoch": 1.9182948490230904, "grad_norm": 0.9985255002975464, "learning_rate": 1.436899162638255e-05, "loss": 1.6936, "step": 810 }, { "epoch": 1.9301361752516282, "grad_norm": 1.5471998453140259, "learning_rate": 1.4089071258705783e-05, "loss": 2.1922, "step": 815 }, { "epoch": 1.941977501480166, "grad_norm": 1.515426754951477, "learning_rate": 1.3810830578530225e-05, "loss": 1.8547, "step": 820 }, { "epoch": 1.9538188277087034, "grad_norm": 1.2444261312484741, "learning_rate": 1.3534312419734066e-05, "loss": 1.9437, "step": 825 }, { "epoch": 1.965660153937241, "grad_norm": 1.3579230308532715, "learning_rate": 1.3259559351021247e-05, "loss": 1.7016, "step": 830 }, { "epoch": 1.9775014801657784, "grad_norm": 1.436259388923645, "learning_rate": 1.2986613669368158e-05, "loss": 1.8801, "step": 835 }, { "epoch": 1.9893428063943162, "grad_norm": 1.196303367614746, "learning_rate": 1.271551739351224e-05, "loss": 1.9408, "step": 840 }, { "epoch": 2.001184132622854, "grad_norm": 1.1158932447433472, "learning_rate": 1.2446312257483358e-05, "loss": 1.6631, "step": 845 }, { "epoch": 2.0130254588513914, "grad_norm": 1.18464195728302, "learning_rate": 1.2179039704179118e-05, "loss": 1.859, "step": 850 }, { "epoch": 2.024866785079929, "grad_norm": 1.7019284963607788, "learning_rate": 1.1913740878984816e-05, "loss": 1.9671, "step": 855 }, { "epoch": 2.0367081113084664, "grad_norm": 1.4283084869384766, "learning_rate": 1.1650456623439367e-05, "loss": 1.8109, "step": 860 }, { "epoch": 2.0485494375370044, "grad_norm": 1.2812663316726685, "learning_rate": 1.1389227468947906e-05, "loss": 1.8024, "step": 865 }, { "epoch": 2.060390763765542, "grad_norm": 1.4143946170806885, "learning_rate": 1.1130093630542198e-05, "loss": 1.8327, "step": 870 }, { "epoch": 2.0722320899940794, "grad_norm": 1.2379989624023438, "learning_rate": 1.0873095000689675e-05, "loss": 1.9797, "step": 875 }, { "epoch": 2.084073416222617, "grad_norm": 1.524370551109314, "learning_rate": 1.0618271143152184e-05, "loss": 1.8164, "step": 880 }, { "epoch": 2.0959147424511544, "grad_norm": 1.7849695682525635, "learning_rate": 1.0365661286895365e-05, "loss": 1.7994, "step": 885 }, { "epoch": 2.1077560686796923, "grad_norm": 2.3028218746185303, "learning_rate": 1.0115304320049479e-05, "loss": 1.9428, "step": 890 }, { "epoch": 2.11959739490823, "grad_norm": 1.4404668807983398, "learning_rate": 9.867238783922789e-06, "loss": 1.9297, "step": 895 }, { "epoch": 2.1314387211367674, "grad_norm": 2.096019744873047, "learning_rate": 9.621502867068285e-06, "loss": 1.8185, "step": 900 }, { "epoch": 2.143280047365305, "grad_norm": 1.393315076828003, "learning_rate": 9.378134399404767e-06, "loss": 1.7902, "step": 905 }, { "epoch": 2.1551213735938424, "grad_norm": 1.4423267841339111, "learning_rate": 9.137170846393054e-06, "loss": 1.7111, "step": 910 }, { "epoch": 2.1669626998223803, "grad_norm": 1.1944553852081299, "learning_rate": 8.898649303268372e-06, "loss": 2.04, "step": 915 }, { "epoch": 2.178804026050918, "grad_norm": 1.407127857208252, "learning_rate": 8.662606489329711e-06, "loss": 1.9046, "step": 920 }, { "epoch": 2.1906453522794553, "grad_norm": 1.855188012123108, "learning_rate": 8.429078742287073e-06, "loss": 2.1426, "step": 925 }, { "epoch": 2.202486678507993, "grad_norm": 1.5526237487792969, "learning_rate": 8.198102012667407e-06, "loss": 2.0743, "step": 930 }, { "epoch": 2.2143280047365304, "grad_norm": 1.1409013271331787, "learning_rate": 7.969711858280252e-06, "loss": 1.7135, "step": 935 }, { "epoch": 2.2261693309650683, "grad_norm": 1.7926199436187744, "learning_rate": 7.743943438743676e-06, "loss": 1.8054, "step": 940 }, { "epoch": 2.238010657193606, "grad_norm": 2.43799090385437, "learning_rate": 7.520831510071744e-06, "loss": 1.8244, "step": 945 }, { "epoch": 2.2498519834221433, "grad_norm": 1.3174521923065186, "learning_rate": 7.300410419323869e-06, "loss": 1.8097, "step": 950 }, { "epoch": 2.261693309650681, "grad_norm": 1.5044059753417969, "learning_rate": 7.082714099317334e-06, "loss": 1.9919, "step": 955 }, { "epoch": 2.2735346358792183, "grad_norm": 1.3440165519714355, "learning_rate": 6.867776063403411e-06, "loss": 1.9084, "step": 960 }, { "epoch": 2.2853759621077563, "grad_norm": 1.5049198865890503, "learning_rate": 6.6556294003081914e-06, "loss": 1.7835, "step": 965 }, { "epoch": 2.297217288336294, "grad_norm": 1.3639992475509644, "learning_rate": 6.44630676903869e-06, "loss": 1.7542, "step": 970 }, { "epoch": 2.3090586145648313, "grad_norm": 1.5013291835784912, "learning_rate": 6.239840393855184e-06, "loss": 2.0351, "step": 975 }, { "epoch": 2.320899940793369, "grad_norm": 1.4973437786102295, "learning_rate": 6.036262059310382e-06, "loss": 1.7908, "step": 980 }, { "epoch": 2.3327412670219063, "grad_norm": 2.3206512928009033, "learning_rate": 5.835603105356396e-06, "loss": 1.7511, "step": 985 }, { "epoch": 2.3445825932504443, "grad_norm": 1.5731827020645142, "learning_rate": 5.637894422520027e-06, "loss": 1.9621, "step": 990 }, { "epoch": 2.3564239194789818, "grad_norm": 1.6597813367843628, "learning_rate": 5.443166447147391e-06, "loss": 2.0578, "step": 995 }, { "epoch": 2.3682652457075193, "grad_norm": 1.659856915473938, "learning_rate": 5.251449156718313e-06, "loss": 1.9198, "step": 1000 }, { "epoch": 2.380106571936057, "grad_norm": 1.1122225522994995, "learning_rate": 5.062772065231491e-06, "loss": 1.6089, "step": 1005 }, { "epoch": 2.3919478981645943, "grad_norm": 1.6533218622207642, "learning_rate": 4.877164218660901e-06, "loss": 1.9499, "step": 1010 }, { "epoch": 2.4037892243931323, "grad_norm": 1.2786110639572144, "learning_rate": 4.694654190484327e-06, "loss": 2.0044, "step": 1015 }, { "epoch": 2.4156305506216698, "grad_norm": 1.542189359664917, "learning_rate": 4.515270077284595e-06, "loss": 2.0896, "step": 1020 }, { "epoch": 2.4274718768502073, "grad_norm": 1.5293947458267212, "learning_rate": 4.339039494424263e-06, "loss": 1.8803, "step": 1025 }, { "epoch": 2.4393132030787448, "grad_norm": 1.390453815460205, "learning_rate": 4.16598957179431e-06, "loss": 2.0795, "step": 1030 }, { "epoch": 2.4511545293072823, "grad_norm": 2.7319912910461426, "learning_rate": 3.996146949637658e-06, "loss": 1.7221, "step": 1035 }, { "epoch": 2.4629958555358202, "grad_norm": 1.080298900604248, "learning_rate": 3.8295377744479995e-06, "loss": 1.6881, "step": 1040 }, { "epoch": 2.4748371817643577, "grad_norm": 1.4576321840286255, "learning_rate": 3.6661876949447007e-06, "loss": 1.8039, "step": 1045 }, { "epoch": 2.4866785079928952, "grad_norm": 1.5691012144088745, "learning_rate": 3.5061218581242535e-06, "loss": 1.9333, "step": 1050 }, { "epoch": 2.4985198342214328, "grad_norm": 1.2626398801803589, "learning_rate": 3.3493649053890326e-06, "loss": 1.6731, "step": 1055 }, { "epoch": 2.5103611604499703, "grad_norm": 1.5094714164733887, "learning_rate": 3.1959409687538853e-06, "loss": 1.8437, "step": 1060 }, { "epoch": 2.522202486678508, "grad_norm": 2.1249232292175293, "learning_rate": 3.04587366713108e-06, "loss": 1.9577, "step": 1065 }, { "epoch": 2.5340438129070457, "grad_norm": 1.563010334968567, "learning_rate": 2.8991861026943014e-06, "loss": 2.0162, "step": 1070 }, { "epoch": 2.5458851391355832, "grad_norm": 2.769664764404297, "learning_rate": 2.7559008573221717e-06, "loss": 1.8091, "step": 1075 }, { "epoch": 2.5577264653641207, "grad_norm": 1.178745985031128, "learning_rate": 2.6160399891218988e-06, "loss": 1.7485, "step": 1080 }, { "epoch": 2.5695677915926582, "grad_norm": 1.479054570198059, "learning_rate": 2.4796250290334887e-06, "loss": 1.9556, "step": 1085 }, { "epoch": 2.581409117821196, "grad_norm": 1.1433284282684326, "learning_rate": 2.346676977515189e-06, "loss": 1.7762, "step": 1090 }, { "epoch": 2.5932504440497337, "grad_norm": 1.6979899406433105, "learning_rate": 2.21721630131054e-06, "loss": 1.7463, "step": 1095 }, { "epoch": 2.605091770278271, "grad_norm": 2.4911599159240723, "learning_rate": 2.0912629302976493e-06, "loss": 1.9825, "step": 1100 }, { "epoch": 2.6169330965068087, "grad_norm": 1.3858684301376343, "learning_rate": 1.968836254421036e-06, "loss": 1.8317, "step": 1105 }, { "epoch": 2.6287744227353462, "grad_norm": 1.304112195968628, "learning_rate": 1.849955120706673e-06, "loss": 1.7377, "step": 1110 }, { "epoch": 2.640615748963884, "grad_norm": 1.253029465675354, "learning_rate": 1.7346378303605359e-06, "loss": 1.8165, "step": 1115 }, { "epoch": 2.6524570751924217, "grad_norm": 1.166486382484436, "learning_rate": 1.6229021359512624e-06, "loss": 1.8233, "step": 1120 }, { "epoch": 2.664298401420959, "grad_norm": 1.5361779928207397, "learning_rate": 1.5147652386771848e-06, "loss": 1.9029, "step": 1125 }, { "epoch": 2.6761397276494967, "grad_norm": 1.3075627088546753, "learning_rate": 1.4102437857183155e-06, "loss": 1.8527, "step": 1130 }, { "epoch": 2.687981053878034, "grad_norm": 1.133543610572815, "learning_rate": 1.3093538676735601e-06, "loss": 1.8855, "step": 1135 }, { "epoch": 2.699822380106572, "grad_norm": 1.6549872159957886, "learning_rate": 1.2121110160836696e-06, "loss": 1.8746, "step": 1140 }, { "epoch": 2.7116637063351097, "grad_norm": 1.2035839557647705, "learning_rate": 1.1185302010402105e-06, "loss": 1.696, "step": 1145 }, { "epoch": 2.723505032563647, "grad_norm": 1.338542103767395, "learning_rate": 1.0286258288810107e-06, "loss": 1.7904, "step": 1150 }, { "epoch": 2.7353463587921847, "grad_norm": 1.4907745122909546, "learning_rate": 9.424117399723431e-07, "loss": 1.962, "step": 1155 }, { "epoch": 2.747187685020722, "grad_norm": 1.6321779489517212, "learning_rate": 8.599012065782924e-07, "loss": 1.9262, "step": 1160 }, { "epoch": 2.75902901124926, "grad_norm": 1.2635369300842285, "learning_rate": 7.811069308175156e-07, "loss": 1.8789, "step": 1165 }, { "epoch": 2.7708703374777977, "grad_norm": 1.5785208940505981, "learning_rate": 7.060410427078473e-07, "loss": 1.7546, "step": 1170 }, { "epoch": 2.782711663706335, "grad_norm": 1.1907869577407837, "learning_rate": 6.347150982989159e-07, "loss": 1.7979, "step": 1175 }, { "epoch": 2.7945529899348727, "grad_norm": 1.3581962585449219, "learning_rate": 5.671400778931468e-07, "loss": 1.7874, "step": 1180 }, { "epoch": 2.80639431616341, "grad_norm": 1.0370872020721436, "learning_rate": 5.033263843554015e-07, "loss": 1.6197, "step": 1185 }, { "epoch": 2.818235642391948, "grad_norm": 1.2103910446166992, "learning_rate": 4.4328384151149095e-07, "loss": 1.5225, "step": 1190 }, { "epoch": 2.8300769686204856, "grad_norm": 1.4887464046478271, "learning_rate": 3.8702169263585554e-07, "loss": 1.9811, "step": 1195 }, { "epoch": 2.841918294849023, "grad_norm": 1.4641401767730713, "learning_rate": 3.345485990286029e-07, "loss": 1.9999, "step": 1200 } ], "logging_steps": 5, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 5.69903990243328e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }