|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 35.0, |
|
"eval_steps": 500, |
|
"global_step": 175490, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0997207818109294, |
|
"grad_norm": 2.346997022628784, |
|
"learning_rate": 4.5023932987634625e-05, |
|
"loss": 8.2424, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1994415636218588, |
|
"grad_norm": 2.3684158325195312, |
|
"learning_rate": 4.0037893897088155e-05, |
|
"loss": 7.6851, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2991623454327882, |
|
"grad_norm": 3.409303665161133, |
|
"learning_rate": 3.5051854806541686e-05, |
|
"loss": 7.4872, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3988831272437176, |
|
"grad_norm": 2.615360975265503, |
|
"learning_rate": 3.0065815715995216e-05, |
|
"loss": 7.344, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.49860390905464697, |
|
"grad_norm": 3.5242176055908203, |
|
"learning_rate": 2.5079776625448743e-05, |
|
"loss": 7.2749, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5983246908655764, |
|
"grad_norm": 3.690262794494629, |
|
"learning_rate": 2.0093737534902273e-05, |
|
"loss": 7.1657, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6980454726765057, |
|
"grad_norm": 2.940692663192749, |
|
"learning_rate": 1.5107698444355806e-05, |
|
"loss": 7.1298, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7977662544874352, |
|
"grad_norm": 2.9132378101348877, |
|
"learning_rate": 1.0121659353809334e-05, |
|
"loss": 7.0938, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.8974870362983646, |
|
"grad_norm": 3.101921558380127, |
|
"learning_rate": 5.135620263262864e-06, |
|
"loss": 7.0715, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.9972078181092939, |
|
"grad_norm": 3.2258358001708984, |
|
"learning_rate": 1.495811727163941e-07, |
|
"loss": 7.0478, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.0969285999202234, |
|
"grad_norm": 3.2722208499908447, |
|
"learning_rate": 3.903270841643399e-05, |
|
"loss": 7.0374, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.1966493817311528, |
|
"grad_norm": 5.218217849731445, |
|
"learning_rate": 3.803550059832469e-05, |
|
"loss": 7.0289, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.2963701635420821, |
|
"grad_norm": 3.466571807861328, |
|
"learning_rate": 3.70382927802154e-05, |
|
"loss": 6.9595, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.3960909453530115, |
|
"grad_norm": 3.688443183898926, |
|
"learning_rate": 3.6041084962106106e-05, |
|
"loss": 6.9267, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.4958117271639408, |
|
"grad_norm": 3.0426700115203857, |
|
"learning_rate": 3.504387714399681e-05, |
|
"loss": 6.8954, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.5955325089748702, |
|
"grad_norm": 3.7769949436187744, |
|
"learning_rate": 3.404666932588751e-05, |
|
"loss": 6.8657, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.6952532907857998, |
|
"grad_norm": 3.0776305198669434, |
|
"learning_rate": 3.304946150777822e-05, |
|
"loss": 6.8285, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.7949740725967291, |
|
"grad_norm": 3.350515604019165, |
|
"learning_rate": 3.2052253689668926e-05, |
|
"loss": 6.7948, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.8946948544076585, |
|
"grad_norm": 3.393035411834717, |
|
"learning_rate": 3.1055045871559636e-05, |
|
"loss": 6.7725, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.994415636218588, |
|
"grad_norm": 3.438401222229004, |
|
"learning_rate": 3.0057838053450336e-05, |
|
"loss": 6.7484, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.0941364180295174, |
|
"grad_norm": 4.042023181915283, |
|
"learning_rate": 2.9060630235341047e-05, |
|
"loss": 6.6939, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.193857199840447, |
|
"grad_norm": 3.3481028079986572, |
|
"learning_rate": 2.8063422417231757e-05, |
|
"loss": 6.6854, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.293577981651376, |
|
"grad_norm": 3.266961097717285, |
|
"learning_rate": 2.706820901475868e-05, |
|
"loss": 6.6555, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.3932987634623055, |
|
"grad_norm": 3.215405225753784, |
|
"learning_rate": 2.607100119664938e-05, |
|
"loss": 6.6713, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.493019545273235, |
|
"grad_norm": 3.380500316619873, |
|
"learning_rate": 2.507379337854009e-05, |
|
"loss": 6.6581, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.5927403270841642, |
|
"grad_norm": 3.536166191101074, |
|
"learning_rate": 2.4076585560430796e-05, |
|
"loss": 6.5945, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.6924611088950936, |
|
"grad_norm": 3.9319474697113037, |
|
"learning_rate": 2.30793777423215e-05, |
|
"loss": 6.6057, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.792181890706023, |
|
"grad_norm": 4.334239482879639, |
|
"learning_rate": 2.2084164339848425e-05, |
|
"loss": 6.5818, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.8919026725169523, |
|
"grad_norm": 4.093286514282227, |
|
"learning_rate": 2.1086956521739132e-05, |
|
"loss": 6.5732, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.9916234543278817, |
|
"grad_norm": 4.026576995849609, |
|
"learning_rate": 2.008974870362984e-05, |
|
"loss": 6.5627, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.0913442361388115, |
|
"grad_norm": 3.7285637855529785, |
|
"learning_rate": 1.9092540885520542e-05, |
|
"loss": 6.5268, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.191065017949741, |
|
"grad_norm": 3.7349226474761963, |
|
"learning_rate": 1.809533306741125e-05, |
|
"loss": 6.5388, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.29078579976067, |
|
"grad_norm": 3.5330066680908203, |
|
"learning_rate": 1.7098125249301956e-05, |
|
"loss": 6.5141, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.3905065815715996, |
|
"grad_norm": 3.6961631774902344, |
|
"learning_rate": 1.6100917431192662e-05, |
|
"loss": 6.5013, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.490227363382529, |
|
"grad_norm": 3.413053274154663, |
|
"learning_rate": 1.5103709613083367e-05, |
|
"loss": 6.4932, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 3.5899481451934583, |
|
"grad_norm": 4.584457874298096, |
|
"learning_rate": 1.4108496210610292e-05, |
|
"loss": 6.4695, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 3.6896689270043876, |
|
"grad_norm": 3.3078787326812744, |
|
"learning_rate": 1.3111288392500998e-05, |
|
"loss": 6.4711, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 3.789389708815317, |
|
"grad_norm": 3.6679279804229736, |
|
"learning_rate": 1.2114080574391703e-05, |
|
"loss": 6.466, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 3.8891104906262464, |
|
"grad_norm": 4.358784198760986, |
|
"learning_rate": 1.1116872756282408e-05, |
|
"loss": 6.4568, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 3.988831272437176, |
|
"grad_norm": 4.014244556427002, |
|
"learning_rate": 1.0119664938173115e-05, |
|
"loss": 6.4536, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.0885520542481055, |
|
"grad_norm": 3.8396079540252686, |
|
"learning_rate": 9.122457120063822e-06, |
|
"loss": 6.443, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 4.188272836059035, |
|
"grad_norm": 3.850647449493408, |
|
"learning_rate": 8.125249301954529e-06, |
|
"loss": 6.4186, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 4.287993617869964, |
|
"grad_norm": 3.829951047897339, |
|
"learning_rate": 7.128041483845234e-06, |
|
"loss": 6.4178, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 4.387714399680894, |
|
"grad_norm": 3.5512278079986572, |
|
"learning_rate": 6.132828081372159e-06, |
|
"loss": 6.4055, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.487435181491823, |
|
"grad_norm": 3.568665027618408, |
|
"learning_rate": 5.135620263262864e-06, |
|
"loss": 6.4076, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 4.587155963302752, |
|
"grad_norm": 3.71463942527771, |
|
"learning_rate": 4.13841244515357e-06, |
|
"loss": 6.4086, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 4.686876745113682, |
|
"grad_norm": 3.9615983963012695, |
|
"learning_rate": 3.1412046270442757e-06, |
|
"loss": 6.4061, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 4.786597526924611, |
|
"grad_norm": 4.0287909507751465, |
|
"learning_rate": 2.1459912245712007e-06, |
|
"loss": 6.3772, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 4.88631830873554, |
|
"grad_norm": 4.012565612792969, |
|
"learning_rate": 1.1487834064619066e-06, |
|
"loss": 6.3956, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 4.98603909054647, |
|
"grad_norm": 4.36814022064209, |
|
"learning_rate": 1.515755883526127e-07, |
|
"loss": 6.3996, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 25070, |
|
"total_flos": 2.639861525017728e+16, |
|
"train_loss": 5.285465436767286, |
|
"train_runtime": 6500.188, |
|
"train_samples_per_second": 61.705, |
|
"train_steps_per_second": 3.857 |
|
}, |
|
{ |
|
"epoch": 5.085759872357399, |
|
"grad_norm": 3.5418105125427246, |
|
"learning_rate": 4.946400079776626e-05, |
|
"loss": 6.5458, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 5.1854806541683285, |
|
"grad_norm": 4.323005676269531, |
|
"learning_rate": 4.884074591144795e-05, |
|
"loss": 6.5604, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 5.285201435979258, |
|
"grad_norm": 4.445618629455566, |
|
"learning_rate": 4.8217491025129644e-05, |
|
"loss": 6.5452, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 5.384922217790187, |
|
"grad_norm": 4.320890426635742, |
|
"learning_rate": 4.759423613881133e-05, |
|
"loss": 6.5239, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 5.484642999601117, |
|
"grad_norm": 3.8980209827423096, |
|
"learning_rate": 4.697098125249302e-05, |
|
"loss": 6.5278, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 5.584363781412046, |
|
"grad_norm": 4.074916362762451, |
|
"learning_rate": 4.6347726366174716e-05, |
|
"loss": 6.5044, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 5.684084563222975, |
|
"grad_norm": 4.465285778045654, |
|
"learning_rate": 4.572447147985641e-05, |
|
"loss": 6.472, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 5.783805345033905, |
|
"grad_norm": 4.351347923278809, |
|
"learning_rate": 4.5101216593538095e-05, |
|
"loss": 6.4504, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 5.883526126844835, |
|
"grad_norm": 4.14565372467041, |
|
"learning_rate": 4.447796170721978e-05, |
|
"loss": 6.4375, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 5.983246908655763, |
|
"grad_norm": 4.669959545135498, |
|
"learning_rate": 4.3854706820901474e-05, |
|
"loss": 6.4393, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 6.082967690466694, |
|
"grad_norm": 4.345717430114746, |
|
"learning_rate": 4.323145193458317e-05, |
|
"loss": 6.3808, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 6.182688472277623, |
|
"grad_norm": 4.040054798126221, |
|
"learning_rate": 4.260819704826486e-05, |
|
"loss": 6.3705, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 6.282409254088552, |
|
"grad_norm": 4.663171291351318, |
|
"learning_rate": 4.198618867171919e-05, |
|
"loss": 6.3803, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 6.382130035899482, |
|
"grad_norm": 4.45890474319458, |
|
"learning_rate": 4.136293378540088e-05, |
|
"loss": 6.3256, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 6.481850817710411, |
|
"grad_norm": 4.158110618591309, |
|
"learning_rate": 4.073967889908257e-05, |
|
"loss": 6.3351, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 6.58157159952134, |
|
"grad_norm": 4.460795879364014, |
|
"learning_rate": 4.0116424012764265e-05, |
|
"loss": 6.3137, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 6.68129238133227, |
|
"grad_norm": 4.767895221710205, |
|
"learning_rate": 3.949316912644596e-05, |
|
"loss": 6.2751, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 6.781013163143199, |
|
"grad_norm": 4.399994850158691, |
|
"learning_rate": 3.887116074990028e-05, |
|
"loss": 6.2345, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 6.8807339449541285, |
|
"grad_norm": 4.522914886474609, |
|
"learning_rate": 3.8247905863581976e-05, |
|
"loss": 6.218, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 6.980454726765058, |
|
"grad_norm": 4.697731018066406, |
|
"learning_rate": 3.762465097726366e-05, |
|
"loss": 6.1819, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 7.080175508575987, |
|
"grad_norm": 5.113608360290527, |
|
"learning_rate": 3.7001396090945355e-05, |
|
"loss": 6.1566, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 7.179896290386917, |
|
"grad_norm": 4.987142086029053, |
|
"learning_rate": 3.637814120462705e-05, |
|
"loss": 6.1504, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 7.279617072197846, |
|
"grad_norm": 4.797494888305664, |
|
"learning_rate": 3.5756132828081373e-05, |
|
"loss": 6.0915, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 7.379337854008775, |
|
"grad_norm": 5.114543437957764, |
|
"learning_rate": 3.5132877941763066e-05, |
|
"loss": 6.0859, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 7.479058635819705, |
|
"grad_norm": 5.5212721824646, |
|
"learning_rate": 3.450962305544476e-05, |
|
"loss": 6.0643, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 7.578779417630634, |
|
"grad_norm": 4.77981424331665, |
|
"learning_rate": 3.3886368169126446e-05, |
|
"loss": 6.038, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 7.678500199441563, |
|
"grad_norm": 5.6912760734558105, |
|
"learning_rate": 3.326311328280814e-05, |
|
"loss": 6.0327, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 7.778220981252493, |
|
"grad_norm": 5.021594524383545, |
|
"learning_rate": 3.2641104906262464e-05, |
|
"loss": 6.0089, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 7.877941763063422, |
|
"grad_norm": 4.9512410163879395, |
|
"learning_rate": 3.201785001994416e-05, |
|
"loss": 5.9914, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 7.9776625448743514, |
|
"grad_norm": 4.6659088134765625, |
|
"learning_rate": 3.139459513362585e-05, |
|
"loss": 5.9688, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 8.07738332668528, |
|
"grad_norm": 5.084179401397705, |
|
"learning_rate": 3.601552017986003e-05, |
|
"loss": 5.9368, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 8.177104108496211, |
|
"grad_norm": 5.475657939910889, |
|
"learning_rate": 3.556224389890126e-05, |
|
"loss": 5.9181, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 8.27682489030714, |
|
"grad_norm": 4.678411960601807, |
|
"learning_rate": 3.510896761794249e-05, |
|
"loss": 5.8795, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 8.37654567211807, |
|
"grad_norm": 5.502169132232666, |
|
"learning_rate": 3.465569133698372e-05, |
|
"loss": 5.8389, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 8.476266453928998, |
|
"grad_norm": 5.32131290435791, |
|
"learning_rate": 3.420241505602495e-05, |
|
"loss": 5.8329, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 8.575987235739928, |
|
"grad_norm": 5.6808552742004395, |
|
"learning_rate": 3.374913877506618e-05, |
|
"loss": 5.8001, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 8.675708017550857, |
|
"grad_norm": 4.988351821899414, |
|
"learning_rate": 3.329586249410741e-05, |
|
"loss": 5.7928, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 8.775428799361787, |
|
"grad_norm": 5.559896469116211, |
|
"learning_rate": 3.284258621314864e-05, |
|
"loss": 5.7488, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 8.875149581172716, |
|
"grad_norm": 6.084516525268555, |
|
"learning_rate": 3.238930993218987e-05, |
|
"loss": 5.7262, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 8.974870362983646, |
|
"grad_norm": 6.219081401824951, |
|
"learning_rate": 3.19360336512311e-05, |
|
"loss": 5.6925, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 9.074591144794574, |
|
"grad_norm": 6.170139789581299, |
|
"learning_rate": 3.1482757370272333e-05, |
|
"loss": 5.6491, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 9.174311926605505, |
|
"grad_norm": 5.830073356628418, |
|
"learning_rate": 3.102948108931356e-05, |
|
"loss": 5.6228, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 9.274032708416435, |
|
"grad_norm": 5.452333927154541, |
|
"learning_rate": 3.0577111360916706e-05, |
|
"loss": 5.5724, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 9.373753490227363, |
|
"grad_norm": 5.113864421844482, |
|
"learning_rate": 3.0123835079957935e-05, |
|
"loss": 5.5437, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 9.473474272038294, |
|
"grad_norm": 5.875530242919922, |
|
"learning_rate": 2.9670558798999164e-05, |
|
"loss": 5.525, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 9.573195053849222, |
|
"grad_norm": 5.342255592346191, |
|
"learning_rate": 2.9217282518040397e-05, |
|
"loss": 5.5145, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 9.672915835660152, |
|
"grad_norm": 6.1103644371032715, |
|
"learning_rate": 2.8764006237081626e-05, |
|
"loss": 5.4687, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 9.77263661747108, |
|
"grad_norm": 6.640170097351074, |
|
"learning_rate": 2.8310729956122855e-05, |
|
"loss": 5.4448, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 9.872357399282011, |
|
"grad_norm": 6.135842323303223, |
|
"learning_rate": 2.7858360227726005e-05, |
|
"loss": 5.4075, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 9.97207818109294, |
|
"grad_norm": 6.063602924346924, |
|
"learning_rate": 2.7405083946767234e-05, |
|
"loss": 5.374, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 10.07179896290387, |
|
"grad_norm": 6.689053535461426, |
|
"learning_rate": 2.6951807665808463e-05, |
|
"loss": 5.3459, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 10.171519744714798, |
|
"grad_norm": 6.488341331481934, |
|
"learning_rate": 2.6498531384849696e-05, |
|
"loss": 5.3185, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 10.271240526525728, |
|
"grad_norm": 6.589330673217773, |
|
"learning_rate": 2.6045255103890925e-05, |
|
"loss": 5.3019, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 10.370961308336657, |
|
"grad_norm": 6.61977481842041, |
|
"learning_rate": 2.5592885375494075e-05, |
|
"loss": 5.2792, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 10.470682090147587, |
|
"grad_norm": 6.396610736846924, |
|
"learning_rate": 2.5139609094535304e-05, |
|
"loss": 5.2347, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 10.570402871958516, |
|
"grad_norm": 7.000791549682617, |
|
"learning_rate": 2.4686332813576534e-05, |
|
"loss": 5.2252, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 10.670123653769446, |
|
"grad_norm": 6.714987277984619, |
|
"learning_rate": 2.4233056532617763e-05, |
|
"loss": 5.1965, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 10.769844435580374, |
|
"grad_norm": 7.012180805206299, |
|
"learning_rate": 2.3779780251658992e-05, |
|
"loss": 5.1769, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 10.869565217391305, |
|
"grad_norm": 6.85835599899292, |
|
"learning_rate": 2.332650397070022e-05, |
|
"loss": 5.1442, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 10.969285999202233, |
|
"grad_norm": 6.789878845214844, |
|
"learning_rate": 2.2873227689741453e-05, |
|
"loss": 5.1071, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"step": 55154, |
|
"total_flos": 5.807695355039002e+16, |
|
"train_loss": 1.5156397336923944, |
|
"train_runtime": 4860.501, |
|
"train_samples_per_second": 181.547, |
|
"train_steps_per_second": 11.347 |
|
}, |
|
{ |
|
"epoch": 11.069006781013163, |
|
"grad_norm": 7.099039077758789, |
|
"learning_rate": 4.976997739662279e-05, |
|
"loss": 5.2813, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 11.168727562824092, |
|
"grad_norm": 6.935009479522705, |
|
"learning_rate": 4.943757479058636e-05, |
|
"loss": 5.2781, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 11.268448344635022, |
|
"grad_norm": 8.239794731140137, |
|
"learning_rate": 4.910517218454993e-05, |
|
"loss": 5.2531, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 11.36816912644595, |
|
"grad_norm": 6.757853031158447, |
|
"learning_rate": 4.87727695785135e-05, |
|
"loss": 5.1861, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 11.46788990825688, |
|
"grad_norm": 7.666926383972168, |
|
"learning_rate": 4.844036697247707e-05, |
|
"loss": 5.1783, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 11.56761069006781, |
|
"grad_norm": 7.166041374206543, |
|
"learning_rate": 4.810796436644063e-05, |
|
"loss": 5.1202, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 11.66733147187874, |
|
"grad_norm": 7.543915748596191, |
|
"learning_rate": 4.77755617604042e-05, |
|
"loss": 5.0482, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 11.76705225368967, |
|
"grad_norm": 8.00036907196045, |
|
"learning_rate": 4.744315915436777e-05, |
|
"loss": 5.0167, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 11.866773035500598, |
|
"grad_norm": 6.7936272621154785, |
|
"learning_rate": 4.711075654833134e-05, |
|
"loss": 4.9823, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 11.966493817311529, |
|
"grad_norm": 7.003523826599121, |
|
"learning_rate": 4.677835394229491e-05, |
|
"loss": 4.9457, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 12.066214599122457, |
|
"grad_norm": 7.01780891418457, |
|
"learning_rate": 4.644595133625848e-05, |
|
"loss": 4.825, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 12.165935380933387, |
|
"grad_norm": 7.654853820800781, |
|
"learning_rate": 4.6113548730222045e-05, |
|
"loss": 4.7741, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 12.265656162744316, |
|
"grad_norm": 7.968235492706299, |
|
"learning_rate": 4.578181092939769e-05, |
|
"loss": 4.7404, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 12.365376944555246, |
|
"grad_norm": 7.112838268280029, |
|
"learning_rate": 4.544940832336126e-05, |
|
"loss": 4.6502, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 12.465097726366174, |
|
"grad_norm": 6.567187786102295, |
|
"learning_rate": 4.511700571732483e-05, |
|
"loss": 4.6277, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 12.564818508177105, |
|
"grad_norm": 6.989046096801758, |
|
"learning_rate": 4.478460311128839e-05, |
|
"loss": 4.5757, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 12.664539289988033, |
|
"grad_norm": 6.270955562591553, |
|
"learning_rate": 4.445220050525196e-05, |
|
"loss": 4.5394, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 12.764260071798963, |
|
"grad_norm": 6.227508544921875, |
|
"learning_rate": 4.412046270442761e-05, |
|
"loss": 4.4651, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 12.863980853609892, |
|
"grad_norm": 6.464995861053467, |
|
"learning_rate": 4.378806009839118e-05, |
|
"loss": 4.423, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 12.963701635420822, |
|
"grad_norm": 6.102914810180664, |
|
"learning_rate": 4.345565749235474e-05, |
|
"loss": 4.3969, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 13.06342241723175, |
|
"grad_norm": 6.3487067222595215, |
|
"learning_rate": 4.312325488631831e-05, |
|
"loss": 4.2689, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 13.16314319904268, |
|
"grad_norm": 6.235875129699707, |
|
"learning_rate": 4.279085228028188e-05, |
|
"loss": 4.2232, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 13.26286398085361, |
|
"grad_norm": 5.931600570678711, |
|
"learning_rate": 4.245844967424545e-05, |
|
"loss": 4.222, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 13.36258476266454, |
|
"grad_norm": 5.873235702514648, |
|
"learning_rate": 4.212604706820902e-05, |
|
"loss": 4.1722, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 13.462305544475468, |
|
"grad_norm": 6.30717134475708, |
|
"learning_rate": 4.179364446217259e-05, |
|
"loss": 4.1255, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 13.562026326286398, |
|
"grad_norm": 5.893185138702393, |
|
"learning_rate": 4.146190666134823e-05, |
|
"loss": 4.0975, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 13.661747108097327, |
|
"grad_norm": 6.775746822357178, |
|
"learning_rate": 4.113016886052387e-05, |
|
"loss": 4.0787, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 13.761467889908257, |
|
"grad_norm": 5.948095798492432, |
|
"learning_rate": 4.0797766254487435e-05, |
|
"loss": 4.0581, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 13.861188671719185, |
|
"grad_norm": 5.961909770965576, |
|
"learning_rate": 4.0465363648451005e-05, |
|
"loss": 4.0097, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 13.960909453530116, |
|
"grad_norm": 5.72122859954834, |
|
"learning_rate": 4.0132961042414575e-05, |
|
"loss": 3.9751, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 14.060630235341046, |
|
"grad_norm": 6.1757378578186035, |
|
"learning_rate": 3.980122324159022e-05, |
|
"loss": 3.9707, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 14.160351017151974, |
|
"grad_norm": 5.7611236572265625, |
|
"learning_rate": 3.946882063555378e-05, |
|
"loss": 3.9126, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 14.260071798962905, |
|
"grad_norm": 6.233034133911133, |
|
"learning_rate": 3.913641802951735e-05, |
|
"loss": 3.9005, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 14.359792580773833, |
|
"grad_norm": 6.282217979431152, |
|
"learning_rate": 3.880401542348092e-05, |
|
"loss": 3.8648, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 14.459513362584763, |
|
"grad_norm": 6.495648384094238, |
|
"learning_rate": 3.847161281744449e-05, |
|
"loss": 3.8567, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 14.559234144395692, |
|
"grad_norm": 6.3030195236206055, |
|
"learning_rate": 3.813921021140806e-05, |
|
"loss": 3.839, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 14.658954926206622, |
|
"grad_norm": 5.807531833648682, |
|
"learning_rate": 3.78074724105837e-05, |
|
"loss": 3.8156, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 14.75867570801755, |
|
"grad_norm": 5.283077716827393, |
|
"learning_rate": 3.747506980454727e-05, |
|
"loss": 3.8142, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 14.85839648982848, |
|
"grad_norm": 5.933303356170654, |
|
"learning_rate": 3.714266719851084e-05, |
|
"loss": 3.8109, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 14.95811727163941, |
|
"grad_norm": 6.217842102050781, |
|
"learning_rate": 3.681026459247441e-05, |
|
"loss": 3.7937, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"step": 75210, |
|
"total_flos": 7.919584575053184e+16, |
|
"train_loss": 1.1771604976443562, |
|
"train_runtime": 4937.567, |
|
"train_samples_per_second": 243.7, |
|
"train_steps_per_second": 15.232 |
|
}, |
|
{ |
|
"epoch": 15.05783805345034, |
|
"grad_norm": 6.581785202026367, |
|
"learning_rate": 4.985540486637415e-05, |
|
"loss": 3.8491, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 15.157558835261268, |
|
"grad_norm": 6.372396469116211, |
|
"learning_rate": 4.960610291184683e-05, |
|
"loss": 3.8838, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 15.257279617072198, |
|
"grad_norm": 6.738864421844482, |
|
"learning_rate": 4.935680095731951e-05, |
|
"loss": 3.8834, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 15.357000398883127, |
|
"grad_norm": 6.700061798095703, |
|
"learning_rate": 4.910749900279219e-05, |
|
"loss": 3.8559, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 15.456721180694057, |
|
"grad_norm": 6.3839497566223145, |
|
"learning_rate": 4.8858197048264857e-05, |
|
"loss": 3.8275, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 15.556441962504985, |
|
"grad_norm": 6.165511131286621, |
|
"learning_rate": 4.860889509373754e-05, |
|
"loss": 3.803, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 15.656162744315916, |
|
"grad_norm": 5.800929069519043, |
|
"learning_rate": 4.8359593139210215e-05, |
|
"loss": 3.8, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 15.755883526126844, |
|
"grad_norm": 6.714051246643066, |
|
"learning_rate": 4.811029118468289e-05, |
|
"loss": 3.797, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 15.855604307937774, |
|
"grad_norm": 6.74777889251709, |
|
"learning_rate": 4.786098923015557e-05, |
|
"loss": 3.7759, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 15.955325089748703, |
|
"grad_norm": 6.980929374694824, |
|
"learning_rate": 4.761168727562824e-05, |
|
"loss": 3.7445, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 16.05504587155963, |
|
"grad_norm": 6.54088020324707, |
|
"learning_rate": 4.736238532110092e-05, |
|
"loss": 3.6805, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 16.15476665337056, |
|
"grad_norm": 5.999478340148926, |
|
"learning_rate": 4.7113083366573594e-05, |
|
"loss": 3.6537, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 16.254487435181492, |
|
"grad_norm": 6.384885311126709, |
|
"learning_rate": 4.686378141204627e-05, |
|
"loss": 3.6522, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 16.354208216992422, |
|
"grad_norm": 6.624803066253662, |
|
"learning_rate": 4.661447945751895e-05, |
|
"loss": 3.6302, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 16.453928998803352, |
|
"grad_norm": 6.454346656799316, |
|
"learning_rate": 4.636567610690068e-05, |
|
"loss": 3.6179, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 16.55364978061428, |
|
"grad_norm": 6.266842365264893, |
|
"learning_rate": 4.611637415237336e-05, |
|
"loss": 3.6265, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 16.65337056242521, |
|
"grad_norm": 6.608065128326416, |
|
"learning_rate": 4.5867072197846036e-05, |
|
"loss": 3.6105, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 16.75309134423614, |
|
"grad_norm": 6.4489426612854, |
|
"learning_rate": 4.5617770243318705e-05, |
|
"loss": 3.5994, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 16.85281212604707, |
|
"grad_norm": 6.433938503265381, |
|
"learning_rate": 4.536896689270044e-05, |
|
"loss": 3.5648, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 16.952532907857996, |
|
"grad_norm": 7.4558610916137695, |
|
"learning_rate": 4.511966493817312e-05, |
|
"loss": 3.5746, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 17.052253689668927, |
|
"grad_norm": 5.742049217224121, |
|
"learning_rate": 4.4870362983645795e-05, |
|
"loss": 3.5378, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 17.151974471479857, |
|
"grad_norm": 6.346868515014648, |
|
"learning_rate": 4.462106102911847e-05, |
|
"loss": 3.505, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 17.251695253290787, |
|
"grad_norm": 6.252668857574463, |
|
"learning_rate": 4.4371759074591147e-05, |
|
"loss": 3.4787, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 17.351416035101714, |
|
"grad_norm": 6.237195014953613, |
|
"learning_rate": 4.412245712006383e-05, |
|
"loss": 3.4914, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 17.451136816912644, |
|
"grad_norm": 7.106077194213867, |
|
"learning_rate": 4.3873653769445554e-05, |
|
"loss": 3.4641, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 17.550857598723574, |
|
"grad_norm": 7.160710334777832, |
|
"learning_rate": 4.362435181491823e-05, |
|
"loss": 3.4419, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 17.650578380534505, |
|
"grad_norm": 7.160135746002197, |
|
"learning_rate": 4.337504986039091e-05, |
|
"loss": 3.4604, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 17.75029916234543, |
|
"grad_norm": 6.785101890563965, |
|
"learning_rate": 4.312574790586358e-05, |
|
"loss": 3.432, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 17.85001994415636, |
|
"grad_norm": 5.990314960479736, |
|
"learning_rate": 4.287644595133626e-05, |
|
"loss": 3.4045, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 17.949740725967292, |
|
"grad_norm": 6.434844493865967, |
|
"learning_rate": 4.2627642600717995e-05, |
|
"loss": 3.4236, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 18.049461507778222, |
|
"grad_norm": 6.7937774658203125, |
|
"learning_rate": 4.2378340646190664e-05, |
|
"loss": 3.3902, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 18.14918228958915, |
|
"grad_norm": 7.1783576011657715, |
|
"learning_rate": 4.212903869166335e-05, |
|
"loss": 3.3545, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 18.24890307140008, |
|
"grad_norm": 6.374876022338867, |
|
"learning_rate": 4.187973673713602e-05, |
|
"loss": 3.3451, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 18.34862385321101, |
|
"grad_norm": 6.49647331237793, |
|
"learning_rate": 4.163093338651775e-05, |
|
"loss": 3.3452, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 18.44834463502194, |
|
"grad_norm": 6.785512924194336, |
|
"learning_rate": 4.138163143199043e-05, |
|
"loss": 3.3102, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 18.54806541683287, |
|
"grad_norm": 6.842392921447754, |
|
"learning_rate": 4.1132329477463106e-05, |
|
"loss": 3.3376, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 18.647786198643796, |
|
"grad_norm": 7.126637935638428, |
|
"learning_rate": 4.088302752293578e-05, |
|
"loss": 3.3249, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 18.747506980454727, |
|
"grad_norm": 5.808903217315674, |
|
"learning_rate": 4.063372556840846e-05, |
|
"loss": 3.2808, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 18.847227762265657, |
|
"grad_norm": 6.2346954345703125, |
|
"learning_rate": 4.0385420821699245e-05, |
|
"loss": 3.3189, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 18.946948544076587, |
|
"grad_norm": 6.60822057723999, |
|
"learning_rate": 4.013611886717192e-05, |
|
"loss": 3.3143, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 19.046669325887514, |
|
"grad_norm": 6.471176624298096, |
|
"learning_rate": 3.9886816912644597e-05, |
|
"loss": 3.2855, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 19.146390107698444, |
|
"grad_norm": 6.365059852600098, |
|
"learning_rate": 3.963751495811727e-05, |
|
"loss": 3.2616, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 19.246110889509374, |
|
"grad_norm": 6.250296592712402, |
|
"learning_rate": 3.9388213003589955e-05, |
|
"loss": 3.226, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 19.345831671320305, |
|
"grad_norm": 6.003506660461426, |
|
"learning_rate": 3.9138911049062624e-05, |
|
"loss": 3.2352, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 19.44555245313123, |
|
"grad_norm": 5.75541353225708, |
|
"learning_rate": 3.88896090945353e-05, |
|
"loss": 3.2395, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 19.54527323494216, |
|
"grad_norm": 6.684996604919434, |
|
"learning_rate": 3.864030714000798e-05, |
|
"loss": 3.2272, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 19.644994016753092, |
|
"grad_norm": 5.906820297241211, |
|
"learning_rate": 3.839100518548066e-05, |
|
"loss": 3.2096, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 19.744714798564022, |
|
"grad_norm": 6.240872383117676, |
|
"learning_rate": 3.814220183486238e-05, |
|
"loss": 3.2016, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 19.84443558037495, |
|
"grad_norm": 6.751197338104248, |
|
"learning_rate": 3.7892899880335066e-05, |
|
"loss": 3.2141, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 19.94415636218588, |
|
"grad_norm": 6.535121917724609, |
|
"learning_rate": 3.764359792580774e-05, |
|
"loss": 3.1829, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 100280, |
|
"total_flos": 1.0559446100070912e+17, |
|
"train_loss": 0.8717835233465437, |
|
"train_runtime": 7516.252, |
|
"train_samples_per_second": 213.455, |
|
"train_steps_per_second": 13.342 |
|
}, |
|
{ |
|
"epoch": 20.04387714399681, |
|
"grad_norm": 6.778732776641846, |
|
"learning_rate": 4.9912245712006384e-05, |
|
"loss": 3.2388, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 20.14359792580774, |
|
"grad_norm": 6.354984760284424, |
|
"learning_rate": 4.9712804148384526e-05, |
|
"loss": 3.2858, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 20.243318707618666, |
|
"grad_norm": 7.301539897918701, |
|
"learning_rate": 4.951336258476267e-05, |
|
"loss": 3.3016, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 20.343039489429597, |
|
"grad_norm": 7.8318772315979, |
|
"learning_rate": 4.931392102114081e-05, |
|
"loss": 3.2969, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 20.442760271240527, |
|
"grad_norm": 6.826496124267578, |
|
"learning_rate": 4.9114479457518946e-05, |
|
"loss": 3.315, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 20.542481053051457, |
|
"grad_norm": 6.47593879699707, |
|
"learning_rate": 4.8915037893897094e-05, |
|
"loss": 3.2395, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 20.642201834862384, |
|
"grad_norm": 6.942465782165527, |
|
"learning_rate": 4.871559633027523e-05, |
|
"loss": 3.2812, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 20.741922616673314, |
|
"grad_norm": 6.694247722625732, |
|
"learning_rate": 4.851615476665337e-05, |
|
"loss": 3.2757, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 20.841643398484244, |
|
"grad_norm": 6.374402046203613, |
|
"learning_rate": 4.8316713203031514e-05, |
|
"loss": 3.2517, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 20.941364180295174, |
|
"grad_norm": 7.804276943206787, |
|
"learning_rate": 4.8117271639409656e-05, |
|
"loss": 3.2417, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 21.041084962106105, |
|
"grad_norm": 7.735393524169922, |
|
"learning_rate": 4.791822895891504e-05, |
|
"loss": 3.2124, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 21.14080574391703, |
|
"grad_norm": 6.500980377197266, |
|
"learning_rate": 4.771878739529318e-05, |
|
"loss": 3.1786, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 21.24052652572796, |
|
"grad_norm": 6.206303119659424, |
|
"learning_rate": 4.751934583167133e-05, |
|
"loss": 3.188, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 21.340247307538892, |
|
"grad_norm": 7.221670150756836, |
|
"learning_rate": 4.731990426804946e-05, |
|
"loss": 3.1658, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 21.439968089349822, |
|
"grad_norm": 6.705102443695068, |
|
"learning_rate": 4.7120462704427605e-05, |
|
"loss": 3.1698, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 21.53968887116075, |
|
"grad_norm": 7.459311485290527, |
|
"learning_rate": 4.692102114080575e-05, |
|
"loss": 3.1263, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 21.63940965297168, |
|
"grad_norm": 6.276129245758057, |
|
"learning_rate": 4.6721978460311135e-05, |
|
"loss": 3.1438, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 6.849742412567139, |
|
"learning_rate": 4.652253689668927e-05, |
|
"loss": 3.1688, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 21.83885121659354, |
|
"grad_norm": 6.463535308837891, |
|
"learning_rate": 4.632309533306741e-05, |
|
"loss": 3.1261, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 21.938571998404466, |
|
"grad_norm": 6.4734063148498535, |
|
"learning_rate": 4.6123653769445554e-05, |
|
"loss": 3.1375, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 22.038292780215397, |
|
"grad_norm": 6.659780025482178, |
|
"learning_rate": 4.5924212205823696e-05, |
|
"loss": 3.1488, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 22.138013562026327, |
|
"grad_norm": 6.0405402183532715, |
|
"learning_rate": 4.572477064220184e-05, |
|
"loss": 3.0816, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 22.237734343837257, |
|
"grad_norm": 6.467530727386475, |
|
"learning_rate": 4.5525329078579974e-05, |
|
"loss": 3.0573, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 22.337455125648184, |
|
"grad_norm": 7.352579116821289, |
|
"learning_rate": 4.532588751495812e-05, |
|
"loss": 3.054, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 22.437175907459114, |
|
"grad_norm": 6.598001956939697, |
|
"learning_rate": 4.5126844834463503e-05, |
|
"loss": 3.0912, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 22.536896689270044, |
|
"grad_norm": 7.065674304962158, |
|
"learning_rate": 4.492780215396889e-05, |
|
"loss": 3.0784, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 22.636617471080974, |
|
"grad_norm": 6.545448303222656, |
|
"learning_rate": 4.472836059034703e-05, |
|
"loss": 3.0489, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 22.7363382528919, |
|
"grad_norm": 6.2428059577941895, |
|
"learning_rate": 4.4528919026725175e-05, |
|
"loss": 3.0412, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 22.83605903470283, |
|
"grad_norm": 6.4470367431640625, |
|
"learning_rate": 4.432947746310331e-05, |
|
"loss": 3.0359, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 22.93577981651376, |
|
"grad_norm": 6.093207836151123, |
|
"learning_rate": 4.413003589948145e-05, |
|
"loss": 3.0304, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 23.035500598324692, |
|
"grad_norm": 6.75270414352417, |
|
"learning_rate": 4.3930594335859595e-05, |
|
"loss": 3.02, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 23.13522138013562, |
|
"grad_norm": 6.7165374755859375, |
|
"learning_rate": 4.373115277223774e-05, |
|
"loss": 3.0069, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 23.23494216194655, |
|
"grad_norm": 5.961038589477539, |
|
"learning_rate": 4.353171120861588e-05, |
|
"loss": 2.9626, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 23.33466294375748, |
|
"grad_norm": 6.657290935516357, |
|
"learning_rate": 4.333266852812126e-05, |
|
"loss": 2.9839, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 23.43438372556841, |
|
"grad_norm": 6.603748798370361, |
|
"learning_rate": 4.31332269644994e-05, |
|
"loss": 2.984, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 23.53410450737934, |
|
"grad_norm": 6.49187707901001, |
|
"learning_rate": 4.2933785400877544e-05, |
|
"loss": 2.9815, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 23.633825289190266, |
|
"grad_norm": 7.08600378036499, |
|
"learning_rate": 4.273474272038293e-05, |
|
"loss": 2.9877, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 23.733546071001197, |
|
"grad_norm": 6.5724077224731445, |
|
"learning_rate": 4.253530115676107e-05, |
|
"loss": 2.965, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 23.833266852812127, |
|
"grad_norm": 6.058481693267822, |
|
"learning_rate": 4.233585959313921e-05, |
|
"loss": 2.9759, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 23.932987634623057, |
|
"grad_norm": 7.042490482330322, |
|
"learning_rate": 4.213641802951736e-05, |
|
"loss": 2.9619, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 24.032708416433984, |
|
"grad_norm": 6.764120578765869, |
|
"learning_rate": 4.193697646589549e-05, |
|
"loss": 2.9482, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 24.132429198244914, |
|
"grad_norm": 6.224752426147461, |
|
"learning_rate": 4.1737534902273635e-05, |
|
"loss": 2.9368, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 24.232149980055844, |
|
"grad_norm": 6.817770481109619, |
|
"learning_rate": 4.153809333865178e-05, |
|
"loss": 2.9325, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 24.331870761866774, |
|
"grad_norm": 6.26372766494751, |
|
"learning_rate": 4.133865177502992e-05, |
|
"loss": 2.8953, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 24.4315915436777, |
|
"grad_norm": 7.136674880981445, |
|
"learning_rate": 4.11396090945353e-05, |
|
"loss": 2.9019, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 24.53131232548863, |
|
"grad_norm": 6.46077299118042, |
|
"learning_rate": 4.094016753091344e-05, |
|
"loss": 2.9091, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 24.63103310729956, |
|
"grad_norm": 6.0465288162231445, |
|
"learning_rate": 4.0740725967291584e-05, |
|
"loss": 2.9107, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 24.730753889110492, |
|
"grad_norm": 6.354468822479248, |
|
"learning_rate": 4.0541284403669726e-05, |
|
"loss": 2.9206, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 24.83047467092142, |
|
"grad_norm": 6.679784297943115, |
|
"learning_rate": 4.0342241723175114e-05, |
|
"loss": 2.8901, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 24.93019545273235, |
|
"grad_norm": 6.418820858001709, |
|
"learning_rate": 4.014280015955325e-05, |
|
"loss": 2.8971, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"step": 125350, |
|
"total_flos": 1.319930762508864e+17, |
|
"train_loss": 0.6150265672133651, |
|
"train_runtime": 7682.6614, |
|
"train_samples_per_second": 261.039, |
|
"train_steps_per_second": 16.316 |
|
}, |
|
{ |
|
"epoch": 25.02991623454328, |
|
"grad_norm": 6.8266754150390625, |
|
"learning_rate": 4.9950139609094536e-05, |
|
"loss": 2.9041, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 25.12963701635421, |
|
"grad_norm": 7.047895431518555, |
|
"learning_rate": 4.9783938306076325e-05, |
|
"loss": 2.9501, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 25.229357798165136, |
|
"grad_norm": 6.489243507385254, |
|
"learning_rate": 4.9617737003058106e-05, |
|
"loss": 2.9795, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 25.329078579976066, |
|
"grad_norm": 6.933114528656006, |
|
"learning_rate": 4.9451535700039895e-05, |
|
"loss": 2.9906, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 25.428799361786997, |
|
"grad_norm": 7.721564769744873, |
|
"learning_rate": 4.9285334397021676e-05, |
|
"loss": 2.9822, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 25.528520143597927, |
|
"grad_norm": 7.604334831237793, |
|
"learning_rate": 4.911913309400346e-05, |
|
"loss": 2.9751, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 25.628240925408853, |
|
"grad_norm": 6.689730644226074, |
|
"learning_rate": 4.8952931790985246e-05, |
|
"loss": 2.9806, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 25.727961707219784, |
|
"grad_norm": 7.001711368560791, |
|
"learning_rate": 4.878673048796703e-05, |
|
"loss": 2.9701, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 25.827682489030714, |
|
"grad_norm": 6.627374649047852, |
|
"learning_rate": 4.862052918494881e-05, |
|
"loss": 2.982, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 25.927403270841644, |
|
"grad_norm": 6.500030517578125, |
|
"learning_rate": 4.8454660284536635e-05, |
|
"loss": 2.9497, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 26.027124052652574, |
|
"grad_norm": 6.908927917480469, |
|
"learning_rate": 4.828845898151842e-05, |
|
"loss": 2.9201, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 26.1268448344635, |
|
"grad_norm": 7.953597068786621, |
|
"learning_rate": 4.8122257678500205e-05, |
|
"loss": 2.8916, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 26.22656561627443, |
|
"grad_norm": 7.111712455749512, |
|
"learning_rate": 4.795605637548199e-05, |
|
"loss": 2.8983, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 26.32628639808536, |
|
"grad_norm": 7.099549293518066, |
|
"learning_rate": 4.778985507246377e-05, |
|
"loss": 2.8862, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 26.426007179896292, |
|
"grad_norm": 6.708031177520752, |
|
"learning_rate": 4.762365376944555e-05, |
|
"loss": 2.8828, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 26.52572796170722, |
|
"grad_norm": 6.638050079345703, |
|
"learning_rate": 4.745745246642734e-05, |
|
"loss": 2.9, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 26.62544874351815, |
|
"grad_norm": 6.474231243133545, |
|
"learning_rate": 4.729125116340912e-05, |
|
"loss": 2.8729, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 26.72516952532908, |
|
"grad_norm": 7.071346759796143, |
|
"learning_rate": 4.712538226299694e-05, |
|
"loss": 2.878, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 26.82489030714001, |
|
"grad_norm": 7.4629740715026855, |
|
"learning_rate": 4.695918095997873e-05, |
|
"loss": 2.8949, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 26.924611088950936, |
|
"grad_norm": 7.166282653808594, |
|
"learning_rate": 4.679331205956655e-05, |
|
"loss": 2.8834, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 27.024331870761866, |
|
"grad_norm": 7.213958263397217, |
|
"learning_rate": 4.6627110756548334e-05, |
|
"loss": 2.8722, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 27.124052652572797, |
|
"grad_norm": 6.917830467224121, |
|
"learning_rate": 4.6460909453530116e-05, |
|
"loss": 2.812, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 27.223773434383727, |
|
"grad_norm": 7.030029296875, |
|
"learning_rate": 4.62947081505119e-05, |
|
"loss": 2.7973, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 27.323494216194653, |
|
"grad_norm": 6.927401542663574, |
|
"learning_rate": 4.6128506847493686e-05, |
|
"loss": 2.8567, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 27.423214998005584, |
|
"grad_norm": 7.063901424407959, |
|
"learning_rate": 4.596230554447547e-05, |
|
"loss": 2.8119, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 27.522935779816514, |
|
"grad_norm": 6.619449138641357, |
|
"learning_rate": 4.5796104241457256e-05, |
|
"loss": 2.814, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 27.622656561627444, |
|
"grad_norm": 6.861698150634766, |
|
"learning_rate": 4.562990293843904e-05, |
|
"loss": 2.7966, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 27.72237734343837, |
|
"grad_norm": 5.698707580566406, |
|
"learning_rate": 4.5464034038026856e-05, |
|
"loss": 2.8274, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 27.8220981252493, |
|
"grad_norm": 6.638801574707031, |
|
"learning_rate": 4.5297832735008645e-05, |
|
"loss": 2.8111, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 27.92181890706023, |
|
"grad_norm": 7.414352893829346, |
|
"learning_rate": 4.5131631431990427e-05, |
|
"loss": 2.8219, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 28.02153968887116, |
|
"grad_norm": 7.000102519989014, |
|
"learning_rate": 4.4965430128972215e-05, |
|
"loss": 2.8059, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 28.121260470682092, |
|
"grad_norm": 7.648940563201904, |
|
"learning_rate": 4.4799561228560034e-05, |
|
"loss": 2.7801, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 28.22098125249302, |
|
"grad_norm": 6.238720417022705, |
|
"learning_rate": 4.4633359925541815e-05, |
|
"loss": 2.7611, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 28.32070203430395, |
|
"grad_norm": 7.083422660827637, |
|
"learning_rate": 4.4467491025129634e-05, |
|
"loss": 2.7476, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 28.42042281611488, |
|
"grad_norm": 7.1048760414123535, |
|
"learning_rate": 4.430128972211142e-05, |
|
"loss": 2.7601, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 28.52014359792581, |
|
"grad_norm": 6.950742244720459, |
|
"learning_rate": 4.4135088419093204e-05, |
|
"loss": 2.7615, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 28.619864379736736, |
|
"grad_norm": 7.063054084777832, |
|
"learning_rate": 4.396888711607499e-05, |
|
"loss": 2.7583, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 28.719585161547666, |
|
"grad_norm": 6.951484680175781, |
|
"learning_rate": 4.3802685813056774e-05, |
|
"loss": 2.748, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 28.819305943358597, |
|
"grad_norm": 7.212677955627441, |
|
"learning_rate": 4.363648451003856e-05, |
|
"loss": 2.7542, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 28.919026725169527, |
|
"grad_norm": 6.691658973693848, |
|
"learning_rate": 4.3470283207020344e-05, |
|
"loss": 2.753, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 29.018747506980453, |
|
"grad_norm": 7.1954874992370605, |
|
"learning_rate": 4.330408190400213e-05, |
|
"loss": 2.7332, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 29.118468288791384, |
|
"grad_norm": 6.654098987579346, |
|
"learning_rate": 4.313821300358995e-05, |
|
"loss": 2.7109, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 29.218189070602314, |
|
"grad_norm": 6.924403667449951, |
|
"learning_rate": 4.297201170057173e-05, |
|
"loss": 2.7076, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 29.317909852413244, |
|
"grad_norm": 7.731849193572998, |
|
"learning_rate": 4.280581039755352e-05, |
|
"loss": 2.6943, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 29.41763063422417, |
|
"grad_norm": 7.095526218414307, |
|
"learning_rate": 4.26396090945353e-05, |
|
"loss": 2.72, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 29.5173514160351, |
|
"grad_norm": 7.1939520835876465, |
|
"learning_rate": 4.247340779151709e-05, |
|
"loss": 2.6772, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 29.61707219784603, |
|
"grad_norm": 7.466503620147705, |
|
"learning_rate": 4.230753889110491e-05, |
|
"loss": 2.7193, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 29.71679297965696, |
|
"grad_norm": 6.902263164520264, |
|
"learning_rate": 4.214133758808669e-05, |
|
"loss": 2.716, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 29.81651376146789, |
|
"grad_norm": 7.366625785827637, |
|
"learning_rate": 4.197513628506848e-05, |
|
"loss": 2.7009, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 29.91623454327882, |
|
"grad_norm": 6.991941452026367, |
|
"learning_rate": 4.180893498205026e-05, |
|
"loss": 2.7202, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"step": 150420, |
|
"total_flos": 1.5839169150106368e+17, |
|
"train_loss": 0.47119966579742084, |
|
"train_runtime": 6930.0607, |
|
"train_samples_per_second": 347.265, |
|
"train_steps_per_second": 21.705 |
|
}, |
|
{ |
|
"epoch": 30.01595532508975, |
|
"grad_norm": 6.077478885650635, |
|
"learning_rate": 4.997720667844322e-05, |
|
"loss": 2.7286, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 30.11567610690068, |
|
"grad_norm": 6.566033363342285, |
|
"learning_rate": 4.983474841871332e-05, |
|
"loss": 2.7319, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 30.215396888711606, |
|
"grad_norm": 7.486234188079834, |
|
"learning_rate": 4.969229015898342e-05, |
|
"loss": 2.7899, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 30.315117670522536, |
|
"grad_norm": 7.640929222106934, |
|
"learning_rate": 4.954983189925352e-05, |
|
"loss": 2.7598, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 30.414838452333466, |
|
"grad_norm": 7.036547660827637, |
|
"learning_rate": 4.940737363952362e-05, |
|
"loss": 2.754, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 30.514559234144397, |
|
"grad_norm": 7.128058910369873, |
|
"learning_rate": 4.926491537979372e-05, |
|
"loss": 2.7888, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 30.614280015955327, |
|
"grad_norm": 7.1788249015808105, |
|
"learning_rate": 4.912245712006382e-05, |
|
"loss": 2.7662, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 30.714000797766253, |
|
"grad_norm": 7.081215858459473, |
|
"learning_rate": 4.897999886033392e-05, |
|
"loss": 2.7722, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 30.813721579577184, |
|
"grad_norm": 6.131695747375488, |
|
"learning_rate": 4.883754060060402e-05, |
|
"loss": 2.7464, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 30.913442361388114, |
|
"grad_norm": 6.66817569732666, |
|
"learning_rate": 4.869508234087412e-05, |
|
"loss": 2.7352, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 31.013163143199044, |
|
"grad_norm": 7.4430952072143555, |
|
"learning_rate": 4.8552908997663685e-05, |
|
"loss": 2.7503, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 31.11288392500997, |
|
"grad_norm": 7.984841346740723, |
|
"learning_rate": 4.8410450737933786e-05, |
|
"loss": 2.6821, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 31.2126047068209, |
|
"grad_norm": 7.386984348297119, |
|
"learning_rate": 4.8267992478203886e-05, |
|
"loss": 2.6916, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 31.31232548863183, |
|
"grad_norm": 6.3857951164245605, |
|
"learning_rate": 4.8125534218473987e-05, |
|
"loss": 2.6826, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 31.41204627044276, |
|
"grad_norm": 7.394888401031494, |
|
"learning_rate": 4.798307595874409e-05, |
|
"loss": 2.7099, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 31.51176705225369, |
|
"grad_norm": 7.39955997467041, |
|
"learning_rate": 4.784061769901419e-05, |
|
"loss": 2.7056, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 31.61148783406462, |
|
"grad_norm": 6.624033451080322, |
|
"learning_rate": 4.769844435580375e-05, |
|
"loss": 2.6903, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 31.71120861587555, |
|
"grad_norm": 6.656693458557129, |
|
"learning_rate": 4.755627101259331e-05, |
|
"loss": 2.6877, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 31.81092939768648, |
|
"grad_norm": 7.474542140960693, |
|
"learning_rate": 4.741381275286341e-05, |
|
"loss": 2.6965, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 31.910650179497406, |
|
"grad_norm": 7.388774394989014, |
|
"learning_rate": 4.727135449313351e-05, |
|
"loss": 2.7145, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 32.01037096130834, |
|
"grad_norm": 7.423541069030762, |
|
"learning_rate": 4.712889623340361e-05, |
|
"loss": 2.6943, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 32.11009174311926, |
|
"grad_norm": 6.063508033752441, |
|
"learning_rate": 4.698643797367371e-05, |
|
"loss": 2.6214, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 32.20981252493019, |
|
"grad_norm": 7.619082450866699, |
|
"learning_rate": 4.6843979713943814e-05, |
|
"loss": 2.6318, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 32.30953330674112, |
|
"grad_norm": 6.978066921234131, |
|
"learning_rate": 4.670152145421392e-05, |
|
"loss": 2.6327, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 32.40925408855205, |
|
"grad_norm": 6.166346073150635, |
|
"learning_rate": 4.655906319448402e-05, |
|
"loss": 2.6419, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 32.508974870362984, |
|
"grad_norm": 7.364738464355469, |
|
"learning_rate": 4.641660493475412e-05, |
|
"loss": 2.6356, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 32.608695652173914, |
|
"grad_norm": 7.476531982421875, |
|
"learning_rate": 4.627414667502422e-05, |
|
"loss": 2.6344, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 32.708416433984844, |
|
"grad_norm": 7.627068042755127, |
|
"learning_rate": 4.613168841529432e-05, |
|
"loss": 2.6434, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 32.808137215795774, |
|
"grad_norm": 7.334908962249756, |
|
"learning_rate": 4.598923015556442e-05, |
|
"loss": 2.663, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 32.907857997606705, |
|
"grad_norm": 6.580120086669922, |
|
"learning_rate": 4.5847341728873446e-05, |
|
"loss": 2.6406, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 33.00757877941763, |
|
"grad_norm": 6.953055381774902, |
|
"learning_rate": 4.570488346914355e-05, |
|
"loss": 2.6517, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 33.10729956122856, |
|
"grad_norm": 6.980926036834717, |
|
"learning_rate": 4.556242520941365e-05, |
|
"loss": 2.589, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 33.20702034303949, |
|
"grad_norm": 7.215412616729736, |
|
"learning_rate": 4.541996694968375e-05, |
|
"loss": 2.5831, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 33.30674112485042, |
|
"grad_norm": 7.203444004058838, |
|
"learning_rate": 4.527750868995385e-05, |
|
"loss": 2.5739, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 33.40646190666135, |
|
"grad_norm": 5.696502685546875, |
|
"learning_rate": 4.513505043022395e-05, |
|
"loss": 2.604, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 33.50618268847228, |
|
"grad_norm": 6.160342216491699, |
|
"learning_rate": 4.499259217049405e-05, |
|
"loss": 2.5848, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 33.60590347028321, |
|
"grad_norm": 6.758869171142578, |
|
"learning_rate": 4.485013391076415e-05, |
|
"loss": 2.6157, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 33.70562425209414, |
|
"grad_norm": 7.064002513885498, |
|
"learning_rate": 4.4708245484073166e-05, |
|
"loss": 2.5765, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 33.80534503390506, |
|
"grad_norm": 7.993391513824463, |
|
"learning_rate": 4.4565787224343267e-05, |
|
"loss": 2.6115, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 33.90506581571599, |
|
"grad_norm": 7.196022033691406, |
|
"learning_rate": 4.442332896461337e-05, |
|
"loss": 2.591, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 34.00478659752692, |
|
"grad_norm": 8.118667602539062, |
|
"learning_rate": 4.428115562140293e-05, |
|
"loss": 2.5833, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 34.10450737933785, |
|
"grad_norm": 7.465199947357178, |
|
"learning_rate": 4.413869736167303e-05, |
|
"loss": 2.5509, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 34.204228161148784, |
|
"grad_norm": 6.739304542541504, |
|
"learning_rate": 4.399623910194313e-05, |
|
"loss": 2.5357, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 34.303948942959714, |
|
"grad_norm": 6.758444786071777, |
|
"learning_rate": 4.385378084221323e-05, |
|
"loss": 2.567, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 34.403669724770644, |
|
"grad_norm": 6.511049270629883, |
|
"learning_rate": 4.371132258248333e-05, |
|
"loss": 2.5759, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 34.503390506581574, |
|
"grad_norm": 7.730967044830322, |
|
"learning_rate": 4.356886432275343e-05, |
|
"loss": 2.5494, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 34.6031112883925, |
|
"grad_norm": 6.543623924255371, |
|
"learning_rate": 4.342640606302353e-05, |
|
"loss": 2.5482, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 34.70283207020343, |
|
"grad_norm": 7.216828346252441, |
|
"learning_rate": 4.328394780329364e-05, |
|
"loss": 2.5593, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 34.80255285201436, |
|
"grad_norm": 6.891706943511963, |
|
"learning_rate": 4.3141774460083194e-05, |
|
"loss": 2.5409, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 34.90227363382529, |
|
"grad_norm": 7.4927778244018555, |
|
"learning_rate": 4.29993162003533e-05, |
|
"loss": 2.5673, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"step": 175490, |
|
"total_flos": 1.8479030675124096e+17, |
|
"train_loss": 0.37831091759340585, |
|
"train_runtime": 6392.496, |
|
"train_samples_per_second": 439.213, |
|
"train_steps_per_second": 27.453 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 175490, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 35, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8479030675124096e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|