{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9982238010657194,
  "eval_steps": 500,
  "global_step": 1266,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011841326228537596,
      "grad_norm": 0.6710847020149231,
      "learning_rate": 4.9998075682257415e-05,
      "loss": 3.876,
      "step": 5
    },
    {
      "epoch": 0.023682652457075192,
      "grad_norm": 0.8348745107650757,
      "learning_rate": 4.9992303025269555e-05,
      "loss": 3.8538,
      "step": 10
    },
    {
      "epoch": 0.035523978685612786,
      "grad_norm": 2.387429714202881,
      "learning_rate": 4.9982682917710524e-05,
      "loss": 3.5742,
      "step": 15
    },
    {
      "epoch": 0.047365304914150384,
      "grad_norm": 1.0946344137191772,
      "learning_rate": 4.996921684055182e-05,
      "loss": 3.5193,
      "step": 20
    },
    {
      "epoch": 0.05920663114268798,
      "grad_norm": 1.1773927211761475,
      "learning_rate": 4.9951906866834316e-05,
      "loss": 2.9465,
      "step": 25
    },
    {
      "epoch": 0.07104795737122557,
      "grad_norm": 1.147207260131836,
      "learning_rate": 4.993075566134921e-05,
      "loss": 3.0208,
      "step": 30
    },
    {
      "epoch": 0.08288928359976318,
      "grad_norm": 1.8781776428222656,
      "learning_rate": 4.990576648022768e-05,
      "loss": 2.7173,
      "step": 35
    },
    {
      "epoch": 0.09473060982830077,
      "grad_norm": 1.009155511856079,
      "learning_rate": 4.987694317043969e-05,
      "loss": 2.6235,
      "step": 40
    },
    {
      "epoch": 0.10657193605683836,
      "grad_norm": 0.9919832944869995,
      "learning_rate": 4.984429016920178e-05,
      "loss": 2.4021,
      "step": 45
    },
    {
      "epoch": 0.11841326228537596,
      "grad_norm": 0.8563632965087891,
      "learning_rate": 4.980781250329389e-05,
      "loss": 2.1688,
      "step": 50
    },
    {
      "epoch": 0.13025458851391356,
      "grad_norm": 0.7374653816223145,
      "learning_rate": 4.976751578828562e-05,
      "loss": 2.2943,
      "step": 55
    },
    {
      "epoch": 0.14209591474245115,
      "grad_norm": 0.9438661336898804,
      "learning_rate": 4.9723406227671643e-05,
      "loss": 1.9571,
      "step": 60
    },
    {
      "epoch": 0.15393724097098876,
      "grad_norm": 0.9622551202774048,
      "learning_rate": 4.967549061191679e-05,
      "loss": 2.1024,
      "step": 65
    },
    {
      "epoch": 0.16577856719952636,
      "grad_norm": 0.6480829119682312,
      "learning_rate": 4.96237763174106e-05,
      "loss": 1.9145,
      "step": 70
    },
    {
      "epoch": 0.17761989342806395,
      "grad_norm": 1.2615852355957031,
      "learning_rate": 4.956827130533185e-05,
      "loss": 2.1472,
      "step": 75
    },
    {
      "epoch": 0.18946121965660154,
      "grad_norm": 1.0979194641113281,
      "learning_rate": 4.95089841204229e-05,
      "loss": 2.0441,
      "step": 80
    },
    {
      "epoch": 0.20130254588513913,
      "grad_norm": 0.8610992431640625,
      "learning_rate": 4.944592388967428e-05,
      "loss": 2.1605,
      "step": 85
    },
    {
      "epoch": 0.21314387211367672,
      "grad_norm": 1.0579572916030884,
      "learning_rate": 4.937910032091968e-05,
      "loss": 2.2801,
      "step": 90
    },
    {
      "epoch": 0.22498519834221434,
      "grad_norm": 1.0034611225128174,
      "learning_rate": 4.930852370134141e-05,
      "loss": 2.2327,
      "step": 95
    },
    {
      "epoch": 0.23682652457075193,
      "grad_norm": 0.8670480847358704,
      "learning_rate": 4.923420489588677e-05,
      "loss": 2.0031,
      "step": 100
    },
    {
      "epoch": 0.24866785079928952,
      "grad_norm": 0.8441548347473145,
      "learning_rate": 4.9156155345595445e-05,
      "loss": 1.8637,
      "step": 105
    },
    {
      "epoch": 0.2605091770278271,
      "grad_norm": 0.8399316072463989,
      "learning_rate": 4.907438706583818e-05,
      "loss": 1.9813,
      "step": 110
    },
    {
      "epoch": 0.27235050325636473,
      "grad_norm": 0.9002792835235596,
      "learning_rate": 4.898891264446709e-05,
      "loss": 2.1698,
      "step": 115
    },
    {
      "epoch": 0.2841918294849023,
      "grad_norm": 1.1735421419143677,
      "learning_rate": 4.8899745239877845e-05,
      "loss": 2.1691,
      "step": 120
    },
    {
      "epoch": 0.2960331557134399,
      "grad_norm": 1.2177324295043945,
      "learning_rate": 4.880689857898392e-05,
      "loss": 2.1437,
      "step": 125
    },
    {
      "epoch": 0.30787448194197753,
      "grad_norm": 1.062441349029541,
      "learning_rate": 4.871038695510347e-05,
      "loss": 2.078,
      "step": 130
    },
    {
      "epoch": 0.3197158081705151,
      "grad_norm": 0.8719960451126099,
      "learning_rate": 4.861022522575892e-05,
      "loss": 2.0338,
      "step": 135
    },
    {
      "epoch": 0.3315571343990527,
      "grad_norm": 1.042143702507019,
      "learning_rate": 4.8506428810389696e-05,
      "loss": 2.1387,
      "step": 140
    },
    {
      "epoch": 0.3433984606275903,
      "grad_norm": 0.8554458022117615,
      "learning_rate": 4.839901368797849e-05,
      "loss": 2.0853,
      "step": 145
    },
    {
      "epoch": 0.3552397868561279,
      "grad_norm": 1.033553957939148,
      "learning_rate": 4.828799639459138e-05,
      "loss": 1.9919,
      "step": 150
    },
    {
      "epoch": 0.36708111308466546,
      "grad_norm": 0.7688170075416565,
      "learning_rate": 4.8173394020832164e-05,
      "loss": 2.1078,
      "step": 155
    },
    {
      "epoch": 0.3789224393132031,
      "grad_norm": 0.9199577569961548,
      "learning_rate": 4.8055224209211316e-05,
      "loss": 1.9385,
      "step": 160
    },
    {
      "epoch": 0.3907637655417407,
      "grad_norm": 1.0168030261993408,
      "learning_rate": 4.793350515143007e-05,
      "loss": 1.8268,
      "step": 165
    },
    {
      "epoch": 0.40260509177027826,
      "grad_norm": 1.2735998630523682,
      "learning_rate": 4.780825558557981e-05,
      "loss": 1.8595,
      "step": 170
    },
    {
      "epoch": 0.4144464179988159,
      "grad_norm": 0.7815925478935242,
      "learning_rate": 4.767949479325748e-05,
      "loss": 2.005,
      "step": 175
    },
    {
      "epoch": 0.42628774422735344,
      "grad_norm": 0.9479308724403381,
      "learning_rate": 4.7547242596597274e-05,
      "loss": 2.0088,
      "step": 180
    },
    {
      "epoch": 0.43812907045589106,
      "grad_norm": 1.5356885194778442,
      "learning_rate": 4.7411519355219066e-05,
      "loss": 2.1591,
      "step": 185
    },
    {
      "epoch": 0.4499703966844287,
      "grad_norm": 1.3683360815048218,
      "learning_rate": 4.727234596309417e-05,
      "loss": 1.9772,
      "step": 190
    },
    {
      "epoch": 0.46181172291296624,
      "grad_norm": 1.2764394283294678,
      "learning_rate": 4.71297438453288e-05,
      "loss": 1.9476,
      "step": 195
    },
    {
      "epoch": 0.47365304914150386,
      "grad_norm": 1.4485411643981934,
      "learning_rate": 4.698373495486579e-05,
      "loss": 2.0495,
      "step": 200
    },
    {
      "epoch": 0.4854943753700414,
      "grad_norm": 0.9207140207290649,
      "learning_rate": 4.683434176910503e-05,
      "loss": 2.0267,
      "step": 205
    },
    {
      "epoch": 0.49733570159857904,
      "grad_norm": 1.0169957876205444,
      "learning_rate": 4.6681587286443146e-05,
      "loss": 1.8998,
      "step": 210
    },
    {
      "epoch": 0.5091770278271166,
      "grad_norm": 1.0399326086044312,
      "learning_rate": 4.652549502273304e-05,
      "loss": 1.9095,
      "step": 215
    },
    {
      "epoch": 0.5210183540556542,
      "grad_norm": 1.6732031106948853,
      "learning_rate": 4.636608900766372e-05,
      "loss": 1.935,
      "step": 220
    },
    {
      "epoch": 0.5328596802841918,
      "grad_norm": 1.1295055150985718,
      "learning_rate": 4.620339378106102e-05,
      "loss": 2.084,
      "step": 225
    },
    {
      "epoch": 0.5447010065127295,
      "grad_norm": 1.196055293083191,
      "learning_rate": 4.603743438910986e-05,
      "loss": 1.9387,
      "step": 230
    },
    {
      "epoch": 0.5565423327412671,
      "grad_norm": 1.519239902496338,
      "learning_rate": 4.586823638049841e-05,
      "loss": 1.871,
      "step": 235
    },
    {
      "epoch": 0.5683836589698046,
      "grad_norm": 1.1272022724151611,
      "learning_rate": 4.5695825802485085e-05,
      "loss": 1.9629,
      "step": 240
    },
    {
      "epoch": 0.5802249851983422,
      "grad_norm": 1.135977864265442,
      "learning_rate": 4.552022919688861e-05,
      "loss": 2.1182,
      "step": 245
    },
    {
      "epoch": 0.5920663114268798,
      "grad_norm": 1.3236933946609497,
      "learning_rate": 4.53414735960021e-05,
      "loss": 1.8874,
      "step": 250
    },
    {
      "epoch": 0.6039076376554174,
      "grad_norm": 2.230280637741089,
      "learning_rate": 4.51595865184315e-05,
      "loss": 1.8281,
      "step": 255
    },
    {
      "epoch": 0.6157489638839551,
      "grad_norm": 1.3642871379852295,
      "learning_rate": 4.497459596485924e-05,
      "loss": 2.0387,
      "step": 260
    },
    {
      "epoch": 0.6275902901124926,
      "grad_norm": 0.9770427942276001,
      "learning_rate": 4.47865304137337e-05,
      "loss": 2.1307,
      "step": 265
    },
    {
      "epoch": 0.6394316163410302,
      "grad_norm": 3.981346845626831,
      "learning_rate": 4.4595418816885004e-05,
      "loss": 2.0755,
      "step": 270
    },
    {
      "epoch": 0.6512729425695678,
      "grad_norm": 1.1023027896881104,
      "learning_rate": 4.440129059506808e-05,
      "loss": 1.9149,
      "step": 275
    },
    {
      "epoch": 0.6631142687981054,
      "grad_norm": 1.648698091506958,
      "learning_rate": 4.420417563343346e-05,
      "loss": 1.984,
      "step": 280
    },
    {
      "epoch": 0.6749555950266429,
      "grad_norm": 1.157861590385437,
      "learning_rate": 4.40041042769266e-05,
      "loss": 1.8697,
      "step": 285
    },
    {
      "epoch": 0.6867969212551805,
      "grad_norm": 1.0009181499481201,
      "learning_rate": 4.380110732561637e-05,
      "loss": 1.7951,
      "step": 290
    },
    {
      "epoch": 0.6986382474837182,
      "grad_norm": 2.11307692527771,
      "learning_rate": 4.3595216029953575e-05,
      "loss": 1.6276,
      "step": 295
    },
    {
      "epoch": 0.7104795737122558,
      "grad_norm": 1.0800515413284302,
      "learning_rate": 4.3386462085960086e-05,
      "loss": 1.8438,
      "step": 300
    },
    {
      "epoch": 0.7223208999407934,
      "grad_norm": 0.9078517556190491,
      "learning_rate": 4.3174877630349366e-05,
      "loss": 1.7839,
      "step": 305
    },
    {
      "epoch": 0.7341622261693309,
      "grad_norm": 1.0842117071151733,
      "learning_rate": 4.296049523557917e-05,
      "loss": 1.8604,
      "step": 310
    },
    {
      "epoch": 0.7460035523978685,
      "grad_norm": 1.2695246934890747,
      "learning_rate": 4.2743347904837176e-05,
      "loss": 1.9514,
      "step": 315
    },
    {
      "epoch": 0.7578448786264061,
      "grad_norm": 1.0501705408096313,
      "learning_rate": 4.2523469066960295e-05,
      "loss": 1.9439,
      "step": 320
    },
    {
      "epoch": 0.7696862048549438,
      "grad_norm": 1.3608386516571045,
      "learning_rate": 4.230089257128842e-05,
      "loss": 2.0266,
      "step": 325
    },
    {
      "epoch": 0.7815275310834814,
      "grad_norm": 1.1087582111358643,
      "learning_rate": 4.2075652682453554e-05,
      "loss": 1.9851,
      "step": 330
    },
    {
      "epoch": 0.7933688573120189,
      "grad_norm": 1.4247711896896362,
      "learning_rate": 4.184778407510484e-05,
      "loss": 1.976,
      "step": 335
    },
    {
      "epoch": 0.8052101835405565,
      "grad_norm": 1.5246973037719727,
      "learning_rate": 4.16173218285706e-05,
      "loss": 1.9039,
      "step": 340
    },
    {
      "epoch": 0.8170515097690941,
      "grad_norm": 0.8650221824645996,
      "learning_rate": 4.138430142145805e-05,
      "loss": 1.9577,
      "step": 345
    },
    {
      "epoch": 0.8288928359976317,
      "grad_norm": 1.5152937173843384,
      "learning_rate": 4.114875872619147e-05,
      "loss": 1.9182,
      "step": 350
    },
    {
      "epoch": 0.8407341622261694,
      "grad_norm": 1.2927278280258179,
      "learning_rate": 4.0910730003489894e-05,
      "loss": 1.9574,
      "step": 355
    },
    {
      "epoch": 0.8525754884547069,
      "grad_norm": 1.193464756011963,
      "learning_rate": 4.067025189678485e-05,
      "loss": 1.9129,
      "step": 360
    },
    {
      "epoch": 0.8644168146832445,
      "grad_norm": 1.1596301794052124,
      "learning_rate": 4.042736142657935e-05,
      "loss": 2.0108,
      "step": 365
    },
    {
      "epoch": 0.8762581409117821,
      "grad_norm": 1.278910756111145,
      "learning_rate": 4.018209598474869e-05,
      "loss": 2.1509,
      "step": 370
    },
    {
      "epoch": 0.8880994671403197,
      "grad_norm": 0.9644595384597778,
      "learning_rate": 3.993449332878418e-05,
      "loss": 2.01,
      "step": 375
    },
    {
      "epoch": 0.8999407933688574,
      "grad_norm": 1.2030296325683594,
      "learning_rate": 3.9684591575980546e-05,
      "loss": 2.0312,
      "step": 380
    },
    {
      "epoch": 0.9117821195973949,
      "grad_norm": 1.4285073280334473,
      "learning_rate": 3.943242919756792e-05,
      "loss": 1.9705,
      "step": 385
    },
    {
      "epoch": 0.9236234458259325,
      "grad_norm": 1.1423532962799072,
      "learning_rate": 3.917804501278942e-05,
      "loss": 2.1293,
      "step": 390
    },
    {
      "epoch": 0.9354647720544701,
      "grad_norm": 1.3577874898910522,
      "learning_rate": 3.8921478182925055e-05,
      "loss": 2.1897,
      "step": 395
    },
    {
      "epoch": 0.9473060982830077,
      "grad_norm": 1.166759967803955,
      "learning_rate": 3.8662768205263044e-05,
      "loss": 1.966,
      "step": 400
    },
    {
      "epoch": 0.9591474245115453,
      "grad_norm": 0.8779304623603821,
      "learning_rate": 3.8401954907019424e-05,
      "loss": 1.898,
      "step": 405
    },
    {
      "epoch": 0.9709887507400828,
      "grad_norm": 0.9446001648902893,
      "learning_rate": 3.813907843920675e-05,
      "loss": 2.1288,
      "step": 410
    },
    {
      "epoch": 0.9828300769686205,
      "grad_norm": 1.2340161800384521,
      "learning_rate": 3.787417927045315e-05,
      "loss": 1.8449,
      "step": 415
    },
    {
      "epoch": 0.9946714031971581,
      "grad_norm": 1.0638560056686401,
      "learning_rate": 3.7607298180772236e-05,
      "loss": 1.8785,
      "step": 420
    },
    {
      "epoch": 1.0065127294256957,
      "grad_norm": 1.046241044998169,
      "learning_rate": 3.733847625528529e-05,
      "loss": 1.801,
      "step": 425
    },
    {
      "epoch": 1.0183540556542332,
      "grad_norm": 0.9987891316413879,
      "learning_rate": 3.706775487789639e-05,
      "loss": 1.9475,
      "step": 430
    },
    {
      "epoch": 1.030195381882771,
      "grad_norm": 1.4792598485946655,
      "learning_rate": 3.679517572492151e-05,
      "loss": 1.9996,
      "step": 435
    },
    {
      "epoch": 1.0420367081113084,
      "grad_norm": 1.3502384424209595,
      "learning_rate": 3.652078075867267e-05,
      "loss": 1.9598,
      "step": 440
    },
    {
      "epoch": 1.0538780343398462,
      "grad_norm": 0.9646159410476685,
      "learning_rate": 3.624461222099804e-05,
      "loss": 1.9598,
      "step": 445
    },
    {
      "epoch": 1.0657193605683837,
      "grad_norm": 1.464591145515442,
      "learning_rate": 3.596671262677898e-05,
      "loss": 1.9089,
      "step": 450
    },
    {
      "epoch": 1.0775606867969212,
      "grad_norm": 0.9556955099105835,
      "learning_rate": 3.568712475738508e-05,
      "loss": 1.9496,
      "step": 455
    },
    {
      "epoch": 1.089402013025459,
      "grad_norm": 1.2917728424072266,
      "learning_rate": 3.5405891654088154e-05,
      "loss": 1.9467,
      "step": 460
    },
    {
      "epoch": 1.1012433392539964,
      "grad_norm": 1.3475714921951294,
      "learning_rate": 3.5123056611436224e-05,
      "loss": 2.0235,
      "step": 465
    },
    {
      "epoch": 1.1130846654825342,
      "grad_norm": 0.9648019671440125,
      "learning_rate": 3.483866317058857e-05,
      "loss": 1.8844,
      "step": 470
    },
    {
      "epoch": 1.1249259917110717,
      "grad_norm": 1.3338149785995483,
      "learning_rate": 3.4552755112612714e-05,
      "loss": 1.8287,
      "step": 475
    },
    {
      "epoch": 1.1367673179396092,
      "grad_norm": 1.1770730018615723,
      "learning_rate": 3.4265376451744565e-05,
      "loss": 2.0754,
      "step": 480
    },
    {
      "epoch": 1.148608644168147,
      "grad_norm": 1.0296565294265747,
      "learning_rate": 3.397657142861258e-05,
      "loss": 1.9269,
      "step": 485
    },
    {
      "epoch": 1.1604499703966844,
      "grad_norm": 1.1220016479492188,
      "learning_rate": 3.3686384503427174e-05,
      "loss": 2.0613,
      "step": 490
    },
    {
      "epoch": 1.1722912966252221,
      "grad_norm": 1.0247845649719238,
      "learning_rate": 3.339486034913627e-05,
      "loss": 1.8047,
      "step": 495
    },
    {
      "epoch": 1.1841326228537596,
      "grad_norm": 0.9551325440406799,
      "learning_rate": 3.3102043844548044e-05,
      "loss": 1.8639,
      "step": 500
    },
    {
      "epoch": 1.1959739490822971,
      "grad_norm": 1.7456352710723877,
      "learning_rate": 3.280798006742213e-05,
      "loss": 1.9545,
      "step": 505
    },
    {
      "epoch": 1.2078152753108349,
      "grad_norm": 1.260713815689087,
      "learning_rate": 3.2512714287530006e-05,
      "loss": 1.874,
      "step": 510
    },
    {
      "epoch": 1.2196566015393724,
      "grad_norm": 1.4492847919464111,
      "learning_rate": 3.2216291959686006e-05,
      "loss": 1.9178,
      "step": 515
    },
    {
      "epoch": 1.2314979277679101,
      "grad_norm": 1.4412329196929932,
      "learning_rate": 3.191875871674971e-05,
      "loss": 1.919,
      "step": 520
    },
    {
      "epoch": 1.2433392539964476,
      "grad_norm": 1.2013176679611206,
      "learning_rate": 3.1620160362600984e-05,
      "loss": 1.9128,
      "step": 525
    },
    {
      "epoch": 1.2551805802249851,
      "grad_norm": 1.0584838390350342,
      "learning_rate": 3.1320542865088696e-05,
      "loss": 2.0132,
      "step": 530
    },
    {
      "epoch": 1.2670219064535229,
      "grad_norm": 0.9713219404220581,
      "learning_rate": 3.101995234895416e-05,
      "loss": 1.9014,
      "step": 535
    },
    {
      "epoch": 1.2788632326820604,
      "grad_norm": 1.167724609375,
      "learning_rate": 3.071843508873046e-05,
      "loss": 1.8482,
      "step": 540
    },
    {
      "epoch": 1.290704558910598,
      "grad_norm": 1.43012273311615,
      "learning_rate": 3.0416037501618677e-05,
      "loss": 1.7475,
      "step": 545
    },
    {
      "epoch": 1.3025458851391356,
      "grad_norm": 1.8622283935546875,
      "learning_rate": 3.0112806140342176e-05,
      "loss": 2.0185,
      "step": 550
    },
    {
      "epoch": 1.3143872113676731,
      "grad_norm": 1.513700008392334,
      "learning_rate": 2.9808787685980054e-05,
      "loss": 1.8825,
      "step": 555
    },
    {
      "epoch": 1.3262285375962108,
      "grad_norm": 1.4935449361801147,
      "learning_rate": 2.9504028940780776e-05,
      "loss": 1.8319,
      "step": 560
    },
    {
      "epoch": 1.3380698638247484,
      "grad_norm": 1.4678127765655518,
      "learning_rate": 2.9198576820957187e-05,
      "loss": 1.9613,
      "step": 565
    },
    {
      "epoch": 1.349911190053286,
      "grad_norm": 1.2753045558929443,
      "learning_rate": 2.8892478349463986e-05,
      "loss": 1.9367,
      "step": 570
    },
    {
      "epoch": 1.3617525162818236,
      "grad_norm": 1.5118064880371094,
      "learning_rate": 2.858578064875874e-05,
      "loss": 1.895,
      "step": 575
    },
    {
      "epoch": 1.373593842510361,
      "grad_norm": 1.3416668176651,
      "learning_rate": 2.8278530933547624e-05,
      "loss": 2.0023,
      "step": 580
    },
    {
      "epoch": 1.3854351687388988,
      "grad_norm": 1.6278949975967407,
      "learning_rate": 2.79707765035169e-05,
      "loss": 1.9469,
      "step": 585
    },
    {
      "epoch": 1.3972764949674363,
      "grad_norm": 1.2708017826080322,
      "learning_rate": 2.7662564736051377e-05,
      "loss": 1.7915,
      "step": 590
    },
    {
      "epoch": 1.409117821195974,
      "grad_norm": 1.0602233409881592,
      "learning_rate": 2.7353943078940875e-05,
      "loss": 1.8253,
      "step": 595
    },
    {
      "epoch": 1.4209591474245116,
      "grad_norm": 1.3578535318374634,
      "learning_rate": 2.7044959043075814e-05,
      "loss": 2.0741,
      "step": 600
    },
    {
      "epoch": 1.432800473653049,
      "grad_norm": 1.1489042043685913,
      "learning_rate": 2.67356601951332e-05,
      "loss": 1.901,
      "step": 605
    },
    {
      "epoch": 1.4446417998815868,
      "grad_norm": 1.680655598640442,
      "learning_rate": 2.64260941502539e-05,
      "loss": 2.0099,
      "step": 610
    },
    {
      "epoch": 1.4564831261101243,
      "grad_norm": 1.7020906209945679,
      "learning_rate": 2.611630856471252e-05,
      "loss": 1.8853,
      "step": 615
    },
    {
      "epoch": 1.468324452338662,
      "grad_norm": 0.9535301923751831,
      "learning_rate": 2.5806351128580964e-05,
      "loss": 1.8205,
      "step": 620
    },
    {
      "epoch": 1.4801657785671996,
      "grad_norm": 1.0446418523788452,
      "learning_rate": 2.5496269558386725e-05,
      "loss": 2.0851,
      "step": 625
    },
    {
      "epoch": 1.492007104795737,
      "grad_norm": 1.1212449073791504,
      "learning_rate": 2.5186111589767187e-05,
      "loss": 2.0913,
      "step": 630
    },
    {
      "epoch": 1.5038484310242746,
      "grad_norm": 1.5539659261703491,
      "learning_rate": 2.487592497012089e-05,
      "loss": 1.9521,
      "step": 635
    },
    {
      "epoch": 1.5156897572528123,
      "grad_norm": 1.3954375982284546,
      "learning_rate": 2.4565757451257128e-05,
      "loss": 1.8525,
      "step": 640
    },
    {
      "epoch": 1.52753108348135,
      "grad_norm": 1.5919753313064575,
      "learning_rate": 2.4255656782044644e-05,
      "loss": 1.8034,
      "step": 645
    },
    {
      "epoch": 1.5393724097098875,
      "grad_norm": 1.420255184173584,
      "learning_rate": 2.3945670701061033e-05,
      "loss": 2.053,
      "step": 650
    },
    {
      "epoch": 1.551213735938425,
      "grad_norm": 1.0267295837402344,
      "learning_rate": 2.3635846929243537e-05,
      "loss": 1.7877,
      "step": 655
    },
    {
      "epoch": 1.5630550621669625,
      "grad_norm": 1.778398036956787,
      "learning_rate": 2.3326233162542655e-05,
      "loss": 1.9865,
      "step": 660
    },
    {
      "epoch": 1.5748963883955003,
      "grad_norm": 1.0454083681106567,
      "learning_rate": 2.3016877064579564e-05,
      "loss": 1.8799,
      "step": 665
    },
    {
      "epoch": 1.586737714624038,
      "grad_norm": 1.4005528688430786,
      "learning_rate": 2.2707826259308492e-05,
      "loss": 1.9329,
      "step": 670
    },
    {
      "epoch": 1.5985790408525755,
      "grad_norm": 1.240485429763794,
      "learning_rate": 2.2399128323685286e-05,
      "loss": 1.7828,
      "step": 675
    },
    {
      "epoch": 1.610420367081113,
      "grad_norm": 1.2078522443771362,
      "learning_rate": 2.2090830780343113e-05,
      "loss": 2.0114,
      "step": 680
    },
    {
      "epoch": 1.6222616933096505,
      "grad_norm": 1.1760393381118774,
      "learning_rate": 2.1782981090276585e-05,
      "loss": 1.8671,
      "step": 685
    },
    {
      "epoch": 1.6341030195381883,
      "grad_norm": 1.362109661102295,
      "learning_rate": 2.147562664553537e-05,
      "loss": 1.9453,
      "step": 690
    },
    {
      "epoch": 1.645944345766726,
      "grad_norm": 2.057487964630127,
      "learning_rate": 2.1168814761928336e-05,
      "loss": 1.7151,
      "step": 695
    },
    {
      "epoch": 1.6577856719952635,
      "grad_norm": 1.5289130210876465,
      "learning_rate": 2.0862592671739608e-05,
      "loss": 1.9591,
      "step": 700
    },
    {
      "epoch": 1.669626998223801,
      "grad_norm": 1.42037832736969,
      "learning_rate": 2.0557007516457288e-05,
      "loss": 1.8999,
      "step": 705
    },
    {
      "epoch": 1.6814683244523385,
      "grad_norm": 1.101355791091919,
      "learning_rate": 2.0252106339516272e-05,
      "loss": 2.0037,
      "step": 710
    },
    {
      "epoch": 1.6933096506808762,
      "grad_norm": 5.125020980834961,
      "learning_rate": 1.9947936079056117e-05,
      "loss": 1.7028,
      "step": 715
    },
    {
      "epoch": 1.705150976909414,
      "grad_norm": 1.3062840700149536,
      "learning_rate": 1.964454356069514e-05,
      "loss": 2.1283,
      "step": 720
    },
    {
      "epoch": 1.7169923031379515,
      "grad_norm": 1.273743987083435,
      "learning_rate": 1.9341975490321827e-05,
      "loss": 2.0017,
      "step": 725
    },
    {
      "epoch": 1.728833629366489,
      "grad_norm": 1.3251115083694458,
      "learning_rate": 1.9040278446904677e-05,
      "loss": 1.7329,
      "step": 730
    },
    {
      "epoch": 1.7406749555950265,
      "grad_norm": 1.5570122003555298,
      "learning_rate": 1.873949887532156e-05,
      "loss": 1.9965,
      "step": 735
    },
    {
      "epoch": 1.7525162818235642,
      "grad_norm": 1.3525424003601074,
      "learning_rate": 1.8439683079209787e-05,
      "loss": 1.9321,
      "step": 740
    },
    {
      "epoch": 1.764357608052102,
      "grad_norm": 1.776936411857605,
      "learning_rate": 1.8140877213837823e-05,
      "loss": 1.9739,
      "step": 745
    },
    {
      "epoch": 1.7761989342806395,
      "grad_norm": 1.317015290260315,
      "learning_rate": 1.7843127278999943e-05,
      "loss": 1.8864,
      "step": 750
    },
    {
      "epoch": 1.788040260509177,
      "grad_norm": 1.6961033344268799,
      "learning_rate": 1.754647911193473e-05,
      "loss": 1.9227,
      "step": 755
    },
    {
      "epoch": 1.7998815867377145,
      "grad_norm": 1.448065161705017,
      "learning_rate": 1.7250978380268694e-05,
      "loss": 2.0025,
      "step": 760
    },
    {
      "epoch": 1.8117229129662522,
      "grad_norm": 1.4134864807128906,
      "learning_rate": 1.6956670574985908e-05,
      "loss": 1.8039,
      "step": 765
    },
    {
      "epoch": 1.82356423919479,
      "grad_norm": 1.3354402780532837,
      "learning_rate": 1.6663601003424883e-05,
      "loss": 1.8008,
      "step": 770
    },
    {
      "epoch": 1.8354055654233274,
      "grad_norm": 1.2575969696044922,
      "learning_rate": 1.6371814782303722e-05,
      "loss": 1.9609,
      "step": 775
    },
    {
      "epoch": 1.847246891651865,
      "grad_norm": 1.4361883401870728,
      "learning_rate": 1.6081356830774625e-05,
      "loss": 2.0843,
      "step": 780
    },
    {
      "epoch": 1.8590882178804025,
      "grad_norm": 1.320430874824524,
      "learning_rate": 1.579227186350875e-05,
      "loss": 1.8281,
      "step": 785
    },
    {
      "epoch": 1.8709295441089402,
      "grad_norm": 1.2178723812103271,
      "learning_rate": 1.5504604383812646e-05,
      "loss": 1.9628,
      "step": 790
    },
    {
      "epoch": 1.882770870337478,
      "grad_norm": 1.3220632076263428,
      "learning_rate": 1.5218398676777102e-05,
      "loss": 1.9867,
      "step": 795
    },
    {
      "epoch": 1.8946121965660154,
      "grad_norm": 0.9402835965156555,
      "learning_rate": 1.4933698802459731e-05,
      "loss": 1.8819,
      "step": 800
    },
    {
      "epoch": 1.906453522794553,
      "grad_norm": 1.3736234903335571,
      "learning_rate": 1.4650548589102092e-05,
      "loss": 1.8938,
      "step": 805
    },
    {
      "epoch": 1.9182948490230904,
      "grad_norm": 0.9985255002975464,
      "learning_rate": 1.436899162638255e-05,
      "loss": 1.6936,
      "step": 810
    },
    {
      "epoch": 1.9301361752516282,
      "grad_norm": 1.5471998453140259,
      "learning_rate": 1.4089071258705783e-05,
      "loss": 2.1922,
      "step": 815
    },
    {
      "epoch": 1.941977501480166,
      "grad_norm": 1.515426754951477,
      "learning_rate": 1.3810830578530225e-05,
      "loss": 1.8547,
      "step": 820
    },
    {
      "epoch": 1.9538188277087034,
      "grad_norm": 1.2444261312484741,
      "learning_rate": 1.3534312419734066e-05,
      "loss": 1.9437,
      "step": 825
    },
    {
      "epoch": 1.965660153937241,
      "grad_norm": 1.3579230308532715,
      "learning_rate": 1.3259559351021247e-05,
      "loss": 1.7016,
      "step": 830
    },
    {
      "epoch": 1.9775014801657784,
      "grad_norm": 1.436259388923645,
      "learning_rate": 1.2986613669368158e-05,
      "loss": 1.8801,
      "step": 835
    },
    {
      "epoch": 1.9893428063943162,
      "grad_norm": 1.196303367614746,
      "learning_rate": 1.271551739351224e-05,
      "loss": 1.9408,
      "step": 840
    },
    {
      "epoch": 2.001184132622854,
      "grad_norm": 1.1158932447433472,
      "learning_rate": 1.2446312257483358e-05,
      "loss": 1.6631,
      "step": 845
    },
    {
      "epoch": 2.0130254588513914,
      "grad_norm": 1.18464195728302,
      "learning_rate": 1.2179039704179118e-05,
      "loss": 1.859,
      "step": 850
    },
    {
      "epoch": 2.024866785079929,
      "grad_norm": 1.7019284963607788,
      "learning_rate": 1.1913740878984816e-05,
      "loss": 1.9671,
      "step": 855
    },
    {
      "epoch": 2.0367081113084664,
      "grad_norm": 1.4283084869384766,
      "learning_rate": 1.1650456623439367e-05,
      "loss": 1.8109,
      "step": 860
    },
    {
      "epoch": 2.0485494375370044,
      "grad_norm": 1.2812663316726685,
      "learning_rate": 1.1389227468947906e-05,
      "loss": 1.8024,
      "step": 865
    },
    {
      "epoch": 2.060390763765542,
      "grad_norm": 1.4143946170806885,
      "learning_rate": 1.1130093630542198e-05,
      "loss": 1.8327,
      "step": 870
    },
    {
      "epoch": 2.0722320899940794,
      "grad_norm": 1.2379989624023438,
      "learning_rate": 1.0873095000689675e-05,
      "loss": 1.9797,
      "step": 875
    },
    {
      "epoch": 2.084073416222617,
      "grad_norm": 1.524370551109314,
      "learning_rate": 1.0618271143152184e-05,
      "loss": 1.8164,
      "step": 880
    },
    {
      "epoch": 2.0959147424511544,
      "grad_norm": 1.7849695682525635,
      "learning_rate": 1.0365661286895365e-05,
      "loss": 1.7994,
      "step": 885
    },
    {
      "epoch": 2.1077560686796923,
      "grad_norm": 2.3028218746185303,
      "learning_rate": 1.0115304320049479e-05,
      "loss": 1.9428,
      "step": 890
    },
    {
      "epoch": 2.11959739490823,
      "grad_norm": 1.4404668807983398,
      "learning_rate": 9.867238783922789e-06,
      "loss": 1.9297,
      "step": 895
    },
    {
      "epoch": 2.1314387211367674,
      "grad_norm": 2.096019744873047,
      "learning_rate": 9.621502867068285e-06,
      "loss": 1.8185,
      "step": 900
    },
    {
      "epoch": 2.143280047365305,
      "grad_norm": 1.393315076828003,
      "learning_rate": 9.378134399404767e-06,
      "loss": 1.7902,
      "step": 905
    },
    {
      "epoch": 2.1551213735938424,
      "grad_norm": 1.4423267841339111,
      "learning_rate": 9.137170846393054e-06,
      "loss": 1.7111,
      "step": 910
    },
    {
      "epoch": 2.1669626998223803,
      "grad_norm": 1.1944553852081299,
      "learning_rate": 8.898649303268372e-06,
      "loss": 2.04,
      "step": 915
    },
    {
      "epoch": 2.178804026050918,
      "grad_norm": 1.407127857208252,
      "learning_rate": 8.662606489329711e-06,
      "loss": 1.9046,
      "step": 920
    },
    {
      "epoch": 2.1906453522794553,
      "grad_norm": 1.855188012123108,
      "learning_rate": 8.429078742287073e-06,
      "loss": 2.1426,
      "step": 925
    },
    {
      "epoch": 2.202486678507993,
      "grad_norm": 1.5526237487792969,
      "learning_rate": 8.198102012667407e-06,
      "loss": 2.0743,
      "step": 930
    },
    {
      "epoch": 2.2143280047365304,
      "grad_norm": 1.1409013271331787,
      "learning_rate": 7.969711858280252e-06,
      "loss": 1.7135,
      "step": 935
    },
    {
      "epoch": 2.2261693309650683,
      "grad_norm": 1.7926199436187744,
      "learning_rate": 7.743943438743676e-06,
      "loss": 1.8054,
      "step": 940
    },
    {
      "epoch": 2.238010657193606,
      "grad_norm": 2.43799090385437,
      "learning_rate": 7.520831510071744e-06,
      "loss": 1.8244,
      "step": 945
    },
    {
      "epoch": 2.2498519834221433,
      "grad_norm": 1.3174521923065186,
      "learning_rate": 7.300410419323869e-06,
      "loss": 1.8097,
      "step": 950
    },
    {
      "epoch": 2.261693309650681,
      "grad_norm": 1.5044059753417969,
      "learning_rate": 7.082714099317334e-06,
      "loss": 1.9919,
      "step": 955
    },
    {
      "epoch": 2.2735346358792183,
      "grad_norm": 1.3440165519714355,
      "learning_rate": 6.867776063403411e-06,
      "loss": 1.9084,
      "step": 960
    },
    {
      "epoch": 2.2853759621077563,
      "grad_norm": 1.5049198865890503,
      "learning_rate": 6.6556294003081914e-06,
      "loss": 1.7835,
      "step": 965
    },
    {
      "epoch": 2.297217288336294,
      "grad_norm": 1.3639992475509644,
      "learning_rate": 6.44630676903869e-06,
      "loss": 1.7542,
      "step": 970
    },
    {
      "epoch": 2.3090586145648313,
      "grad_norm": 1.5013291835784912,
      "learning_rate": 6.239840393855184e-06,
      "loss": 2.0351,
      "step": 975
    },
    {
      "epoch": 2.320899940793369,
      "grad_norm": 1.4973437786102295,
      "learning_rate": 6.036262059310382e-06,
      "loss": 1.7908,
      "step": 980
    },
    {
      "epoch": 2.3327412670219063,
      "grad_norm": 2.3206512928009033,
      "learning_rate": 5.835603105356396e-06,
      "loss": 1.7511,
      "step": 985
    },
    {
      "epoch": 2.3445825932504443,
      "grad_norm": 1.5731827020645142,
      "learning_rate": 5.637894422520027e-06,
      "loss": 1.9621,
      "step": 990
    },
    {
      "epoch": 2.3564239194789818,
      "grad_norm": 1.6597813367843628,
      "learning_rate": 5.443166447147391e-06,
      "loss": 2.0578,
      "step": 995
    },
    {
      "epoch": 2.3682652457075193,
      "grad_norm": 1.659856915473938,
      "learning_rate": 5.251449156718313e-06,
      "loss": 1.9198,
      "step": 1000
    },
    {
      "epoch": 2.380106571936057,
      "grad_norm": 1.1122225522994995,
      "learning_rate": 5.062772065231491e-06,
      "loss": 1.6089,
      "step": 1005
    },
    {
      "epoch": 2.3919478981645943,
      "grad_norm": 1.6533218622207642,
      "learning_rate": 4.877164218660901e-06,
      "loss": 1.9499,
      "step": 1010
    },
    {
      "epoch": 2.4037892243931323,
      "grad_norm": 1.2786110639572144,
      "learning_rate": 4.694654190484327e-06,
      "loss": 2.0044,
      "step": 1015
    },
    {
      "epoch": 2.4156305506216698,
      "grad_norm": 1.542189359664917,
      "learning_rate": 4.515270077284595e-06,
      "loss": 2.0896,
      "step": 1020
    },
    {
      "epoch": 2.4274718768502073,
      "grad_norm": 1.5293947458267212,
      "learning_rate": 4.339039494424263e-06,
      "loss": 1.8803,
      "step": 1025
    },
    {
      "epoch": 2.4393132030787448,
      "grad_norm": 1.390453815460205,
      "learning_rate": 4.16598957179431e-06,
      "loss": 2.0795,
      "step": 1030
    },
    {
      "epoch": 2.4511545293072823,
      "grad_norm": 2.7319912910461426,
      "learning_rate": 3.996146949637658e-06,
      "loss": 1.7221,
      "step": 1035
    },
    {
      "epoch": 2.4629958555358202,
      "grad_norm": 1.080298900604248,
      "learning_rate": 3.8295377744479995e-06,
      "loss": 1.6881,
      "step": 1040
    },
    {
      "epoch": 2.4748371817643577,
      "grad_norm": 1.4576321840286255,
      "learning_rate": 3.6661876949447007e-06,
      "loss": 1.8039,
      "step": 1045
    },
    {
      "epoch": 2.4866785079928952,
      "grad_norm": 1.5691012144088745,
      "learning_rate": 3.5061218581242535e-06,
      "loss": 1.9333,
      "step": 1050
    },
    {
      "epoch": 2.4985198342214328,
      "grad_norm": 1.2626398801803589,
      "learning_rate": 3.3493649053890326e-06,
      "loss": 1.6731,
      "step": 1055
    },
    {
      "epoch": 2.5103611604499703,
      "grad_norm": 1.5094714164733887,
      "learning_rate": 3.1959409687538853e-06,
      "loss": 1.8437,
      "step": 1060
    },
    {
      "epoch": 2.522202486678508,
      "grad_norm": 2.1249232292175293,
      "learning_rate": 3.04587366713108e-06,
      "loss": 1.9577,
      "step": 1065
    },
    {
      "epoch": 2.5340438129070457,
      "grad_norm": 1.563010334968567,
      "learning_rate": 2.8991861026943014e-06,
      "loss": 2.0162,
      "step": 1070
    },
    {
      "epoch": 2.5458851391355832,
      "grad_norm": 2.769664764404297,
      "learning_rate": 2.7559008573221717e-06,
      "loss": 1.8091,
      "step": 1075
    },
    {
      "epoch": 2.5577264653641207,
      "grad_norm": 1.178745985031128,
      "learning_rate": 2.6160399891218988e-06,
      "loss": 1.7485,
      "step": 1080
    },
    {
      "epoch": 2.5695677915926582,
      "grad_norm": 1.479054570198059,
      "learning_rate": 2.4796250290334887e-06,
      "loss": 1.9556,
      "step": 1085
    },
    {
      "epoch": 2.581409117821196,
      "grad_norm": 1.1433284282684326,
      "learning_rate": 2.346676977515189e-06,
      "loss": 1.7762,
      "step": 1090
    },
    {
      "epoch": 2.5932504440497337,
      "grad_norm": 1.6979899406433105,
      "learning_rate": 2.21721630131054e-06,
      "loss": 1.7463,
      "step": 1095
    },
    {
      "epoch": 2.605091770278271,
      "grad_norm": 2.4911599159240723,
      "learning_rate": 2.0912629302976493e-06,
      "loss": 1.9825,
      "step": 1100
    },
    {
      "epoch": 2.6169330965068087,
      "grad_norm": 1.3858684301376343,
      "learning_rate": 1.968836254421036e-06,
      "loss": 1.8317,
      "step": 1105
    },
    {
      "epoch": 2.6287744227353462,
      "grad_norm": 1.304112195968628,
      "learning_rate": 1.849955120706673e-06,
      "loss": 1.7377,
      "step": 1110
    },
    {
      "epoch": 2.640615748963884,
      "grad_norm": 1.253029465675354,
      "learning_rate": 1.7346378303605359e-06,
      "loss": 1.8165,
      "step": 1115
    },
    {
      "epoch": 2.6524570751924217,
      "grad_norm": 1.166486382484436,
      "learning_rate": 1.6229021359512624e-06,
      "loss": 1.8233,
      "step": 1120
    },
    {
      "epoch": 2.664298401420959,
      "grad_norm": 1.5361779928207397,
      "learning_rate": 1.5147652386771848e-06,
      "loss": 1.9029,
      "step": 1125
    },
    {
      "epoch": 2.6761397276494967,
      "grad_norm": 1.3075627088546753,
      "learning_rate": 1.4102437857183155e-06,
      "loss": 1.8527,
      "step": 1130
    },
    {
      "epoch": 2.687981053878034,
      "grad_norm": 1.133543610572815,
      "learning_rate": 1.3093538676735601e-06,
      "loss": 1.8855,
      "step": 1135
    },
    {
      "epoch": 2.699822380106572,
      "grad_norm": 1.6549872159957886,
      "learning_rate": 1.2121110160836696e-06,
      "loss": 1.8746,
      "step": 1140
    },
    {
      "epoch": 2.7116637063351097,
      "grad_norm": 1.2035839557647705,
      "learning_rate": 1.1185302010402105e-06,
      "loss": 1.696,
      "step": 1145
    },
    {
      "epoch": 2.723505032563647,
      "grad_norm": 1.338542103767395,
      "learning_rate": 1.0286258288810107e-06,
      "loss": 1.7904,
      "step": 1150
    },
    {
      "epoch": 2.7353463587921847,
      "grad_norm": 1.4907745122909546,
      "learning_rate": 9.424117399723431e-07,
      "loss": 1.962,
      "step": 1155
    },
    {
      "epoch": 2.747187685020722,
      "grad_norm": 1.6321779489517212,
      "learning_rate": 8.599012065782924e-07,
      "loss": 1.9262,
      "step": 1160
    },
    {
      "epoch": 2.75902901124926,
      "grad_norm": 1.2635369300842285,
      "learning_rate": 7.811069308175156e-07,
      "loss": 1.8789,
      "step": 1165
    },
    {
      "epoch": 2.7708703374777977,
      "grad_norm": 1.5785208940505981,
      "learning_rate": 7.060410427078473e-07,
      "loss": 1.7546,
      "step": 1170
    },
    {
      "epoch": 2.782711663706335,
      "grad_norm": 1.1907869577407837,
      "learning_rate": 6.347150982989159e-07,
      "loss": 1.7979,
      "step": 1175
    },
    {
      "epoch": 2.7945529899348727,
      "grad_norm": 1.3581962585449219,
      "learning_rate": 5.671400778931468e-07,
      "loss": 1.7874,
      "step": 1180
    },
    {
      "epoch": 2.80639431616341,
      "grad_norm": 1.0370872020721436,
      "learning_rate": 5.033263843554015e-07,
      "loss": 1.6197,
      "step": 1185
    },
    {
      "epoch": 2.818235642391948,
      "grad_norm": 1.2103910446166992,
      "learning_rate": 4.4328384151149095e-07,
      "loss": 1.5225,
      "step": 1190
    },
    {
      "epoch": 2.8300769686204856,
      "grad_norm": 1.4887464046478271,
      "learning_rate": 3.8702169263585554e-07,
      "loss": 1.9811,
      "step": 1195
    },
    {
      "epoch": 2.841918294849023,
      "grad_norm": 1.4641401767730713,
      "learning_rate": 3.345485990286029e-07,
      "loss": 1.9999,
      "step": 1200
    },
    {
      "epoch": 2.8537596210775606,
      "grad_norm": 1.529491901397705,
      "learning_rate": 2.8587263868213585e-07,
      "loss": 2.1399,
      "step": 1205
    },
    {
      "epoch": 2.865600947306098,
      "grad_norm": 1.5479165315628052,
      "learning_rate": 2.410013050375859e-07,
      "loss": 1.8664,
      "step": 1210
    },
    {
      "epoch": 2.877442273534636,
      "grad_norm": 1.1747933626174927,
      "learning_rate": 1.999415058312276e-07,
      "loss": 1.9633,
      "step": 1215
    },
    {
      "epoch": 2.8892835997631736,
      "grad_norm": 1.358820915222168,
      "learning_rate": 1.6269956203107117e-07,
      "loss": 2.0106,
      "step": 1220
    },
    {
      "epoch": 2.901124925991711,
      "grad_norm": 1.554287314414978,
      "learning_rate": 1.2928120686377388e-07,
      "loss": 1.7896,
      "step": 1225
    },
    {
      "epoch": 2.9129662522202486,
      "grad_norm": 1.5915781259536743,
      "learning_rate": 9.969158493204067e-08,
      "loss": 1.8759,
      "step": 1230
    },
    {
      "epoch": 2.924807578448786,
      "grad_norm": 1.6586300134658813,
      "learning_rate": 7.393525142262991e-08,
      "loss": 1.9413,
      "step": 1235
    },
    {
      "epoch": 2.936648904677324,
      "grad_norm": 1.416338562965393,
      "learning_rate": 5.2016171405103174e-08,
      "loss": 1.8404,
      "step": 1240
    },
    {
      "epoch": 2.9484902309058616,
      "grad_norm": 1.3145766258239746,
      "learning_rate": 3.393771922142741e-08,
      "loss": 2.0654,
      "step": 1245
    },
    {
      "epoch": 2.960331557134399,
      "grad_norm": 1.3440285921096802,
      "learning_rate": 1.9702677966507154e-08,
      "loss": 1.8111,
      "step": 1250
    },
    {
      "epoch": 2.9721728833629366,
      "grad_norm": 1.3247921466827393,
      "learning_rate": 9.31323905974113e-09,
      "loss": 1.8708,
      "step": 1255
    },
    {
      "epoch": 2.984014209591474,
      "grad_norm": 1.7957427501678467,
      "learning_rate": 2.771001907653226e-09,
      "loss": 1.8842,
      "step": 1260
    },
    {
      "epoch": 2.995855535820012,
      "grad_norm": 1.4069582223892212,
      "learning_rate": 7.697365768943864e-11,
      "loss": 1.9665,
      "step": 1265
    },
    {
      "epoch": 2.9982238010657194,
      "step": 1266,
      "total_flos": 6.019503790030848e+16,
      "train_loss": 1.9697479162170988,
      "train_runtime": 4932.4291,
      "train_samples_per_second": 4.109,
      "train_steps_per_second": 0.257
    }
  ],
  "logging_steps": 5,
  "max_steps": 1266,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 6.019503790030848e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}