{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 35.0,
"eval_steps": 500,
"global_step": 175490,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0997207818109294,
"grad_norm": 2.346997022628784,
"learning_rate": 4.5023932987634625e-05,
"loss": 8.2424,
"step": 500
},
{
"epoch": 0.1994415636218588,
"grad_norm": 2.3684158325195312,
"learning_rate": 4.0037893897088155e-05,
"loss": 7.6851,
"step": 1000
},
{
"epoch": 0.2991623454327882,
"grad_norm": 3.409303665161133,
"learning_rate": 3.5051854806541686e-05,
"loss": 7.4872,
"step": 1500
},
{
"epoch": 0.3988831272437176,
"grad_norm": 2.615360975265503,
"learning_rate": 3.0065815715995216e-05,
"loss": 7.344,
"step": 2000
},
{
"epoch": 0.49860390905464697,
"grad_norm": 3.5242176055908203,
"learning_rate": 2.5079776625448743e-05,
"loss": 7.2749,
"step": 2500
},
{
"epoch": 0.5983246908655764,
"grad_norm": 3.690262794494629,
"learning_rate": 2.0093737534902273e-05,
"loss": 7.1657,
"step": 3000
},
{
"epoch": 0.6980454726765057,
"grad_norm": 2.940692663192749,
"learning_rate": 1.5107698444355806e-05,
"loss": 7.1298,
"step": 3500
},
{
"epoch": 0.7977662544874352,
"grad_norm": 2.9132378101348877,
"learning_rate": 1.0121659353809334e-05,
"loss": 7.0938,
"step": 4000
},
{
"epoch": 0.8974870362983646,
"grad_norm": 3.101921558380127,
"learning_rate": 5.135620263262864e-06,
"loss": 7.0715,
"step": 4500
},
{
"epoch": 0.9972078181092939,
"grad_norm": 3.2258358001708984,
"learning_rate": 1.495811727163941e-07,
"loss": 7.0478,
"step": 5000
},
{
"epoch": 1.0969285999202234,
"grad_norm": 3.2722208499908447,
"learning_rate": 3.903270841643399e-05,
"loss": 7.0374,
"step": 5500
},
{
"epoch": 1.1966493817311528,
"grad_norm": 5.218217849731445,
"learning_rate": 3.803550059832469e-05,
"loss": 7.0289,
"step": 6000
},
{
"epoch": 1.2963701635420821,
"grad_norm": 3.466571807861328,
"learning_rate": 3.70382927802154e-05,
"loss": 6.9595,
"step": 6500
},
{
"epoch": 1.3960909453530115,
"grad_norm": 3.688443183898926,
"learning_rate": 3.6041084962106106e-05,
"loss": 6.9267,
"step": 7000
},
{
"epoch": 1.4958117271639408,
"grad_norm": 3.0426700115203857,
"learning_rate": 3.504387714399681e-05,
"loss": 6.8954,
"step": 7500
},
{
"epoch": 1.5955325089748702,
"grad_norm": 3.7769949436187744,
"learning_rate": 3.404666932588751e-05,
"loss": 6.8657,
"step": 8000
},
{
"epoch": 1.6952532907857998,
"grad_norm": 3.0776305198669434,
"learning_rate": 3.304946150777822e-05,
"loss": 6.8285,
"step": 8500
},
{
"epoch": 1.7949740725967291,
"grad_norm": 3.350515604019165,
"learning_rate": 3.2052253689668926e-05,
"loss": 6.7948,
"step": 9000
},
{
"epoch": 1.8946948544076585,
"grad_norm": 3.393035411834717,
"learning_rate": 3.1055045871559636e-05,
"loss": 6.7725,
"step": 9500
},
{
"epoch": 1.994415636218588,
"grad_norm": 3.438401222229004,
"learning_rate": 3.0057838053450336e-05,
"loss": 6.7484,
"step": 10000
},
{
"epoch": 2.0941364180295174,
"grad_norm": 4.042023181915283,
"learning_rate": 2.9060630235341047e-05,
"loss": 6.6939,
"step": 10500
},
{
"epoch": 2.193857199840447,
"grad_norm": 3.3481028079986572,
"learning_rate": 2.8063422417231757e-05,
"loss": 6.6854,
"step": 11000
},
{
"epoch": 2.293577981651376,
"grad_norm": 3.266961097717285,
"learning_rate": 2.706820901475868e-05,
"loss": 6.6555,
"step": 11500
},
{
"epoch": 2.3932987634623055,
"grad_norm": 3.215405225753784,
"learning_rate": 2.607100119664938e-05,
"loss": 6.6713,
"step": 12000
},
{
"epoch": 2.493019545273235,
"grad_norm": 3.380500316619873,
"learning_rate": 2.507379337854009e-05,
"loss": 6.6581,
"step": 12500
},
{
"epoch": 2.5927403270841642,
"grad_norm": 3.536166191101074,
"learning_rate": 2.4076585560430796e-05,
"loss": 6.5945,
"step": 13000
},
{
"epoch": 2.6924611088950936,
"grad_norm": 3.9319474697113037,
"learning_rate": 2.30793777423215e-05,
"loss": 6.6057,
"step": 13500
},
{
"epoch": 2.792181890706023,
"grad_norm": 4.334239482879639,
"learning_rate": 2.2084164339848425e-05,
"loss": 6.5818,
"step": 14000
},
{
"epoch": 2.8919026725169523,
"grad_norm": 4.093286514282227,
"learning_rate": 2.1086956521739132e-05,
"loss": 6.5732,
"step": 14500
},
{
"epoch": 2.9916234543278817,
"grad_norm": 4.026576995849609,
"learning_rate": 2.008974870362984e-05,
"loss": 6.5627,
"step": 15000
},
{
"epoch": 3.0913442361388115,
"grad_norm": 3.7285637855529785,
"learning_rate": 1.9092540885520542e-05,
"loss": 6.5268,
"step": 15500
},
{
"epoch": 3.191065017949741,
"grad_norm": 3.7349226474761963,
"learning_rate": 1.809533306741125e-05,
"loss": 6.5388,
"step": 16000
},
{
"epoch": 3.29078579976067,
"grad_norm": 3.5330066680908203,
"learning_rate": 1.7098125249301956e-05,
"loss": 6.5141,
"step": 16500
},
{
"epoch": 3.3905065815715996,
"grad_norm": 3.6961631774902344,
"learning_rate": 1.6100917431192662e-05,
"loss": 6.5013,
"step": 17000
},
{
"epoch": 3.490227363382529,
"grad_norm": 3.413053274154663,
"learning_rate": 1.5103709613083367e-05,
"loss": 6.4932,
"step": 17500
},
{
"epoch": 3.5899481451934583,
"grad_norm": 4.584457874298096,
"learning_rate": 1.4108496210610292e-05,
"loss": 6.4695,
"step": 18000
},
{
"epoch": 3.6896689270043876,
"grad_norm": 3.3078787326812744,
"learning_rate": 1.3111288392500998e-05,
"loss": 6.4711,
"step": 18500
},
{
"epoch": 3.789389708815317,
"grad_norm": 3.6679279804229736,
"learning_rate": 1.2114080574391703e-05,
"loss": 6.466,
"step": 19000
},
{
"epoch": 3.8891104906262464,
"grad_norm": 4.358784198760986,
"learning_rate": 1.1116872756282408e-05,
"loss": 6.4568,
"step": 19500
},
{
"epoch": 3.988831272437176,
"grad_norm": 4.014244556427002,
"learning_rate": 1.0119664938173115e-05,
"loss": 6.4536,
"step": 20000
},
{
"epoch": 4.0885520542481055,
"grad_norm": 3.8396079540252686,
"learning_rate": 9.122457120063822e-06,
"loss": 6.443,
"step": 20500
},
{
"epoch": 4.188272836059035,
"grad_norm": 3.850647449493408,
"learning_rate": 8.125249301954529e-06,
"loss": 6.4186,
"step": 21000
},
{
"epoch": 4.287993617869964,
"grad_norm": 3.829951047897339,
"learning_rate": 7.128041483845234e-06,
"loss": 6.4178,
"step": 21500
},
{
"epoch": 4.387714399680894,
"grad_norm": 3.5512278079986572,
"learning_rate": 6.132828081372159e-06,
"loss": 6.4055,
"step": 22000
},
{
"epoch": 4.487435181491823,
"grad_norm": 3.568665027618408,
"learning_rate": 5.135620263262864e-06,
"loss": 6.4076,
"step": 22500
},
{
"epoch": 4.587155963302752,
"grad_norm": 3.71463942527771,
"learning_rate": 4.13841244515357e-06,
"loss": 6.4086,
"step": 23000
},
{
"epoch": 4.686876745113682,
"grad_norm": 3.9615983963012695,
"learning_rate": 3.1412046270442757e-06,
"loss": 6.4061,
"step": 23500
},
{
"epoch": 4.786597526924611,
"grad_norm": 4.0287909507751465,
"learning_rate": 2.1459912245712007e-06,
"loss": 6.3772,
"step": 24000
},
{
"epoch": 4.88631830873554,
"grad_norm": 4.012565612792969,
"learning_rate": 1.1487834064619066e-06,
"loss": 6.3956,
"step": 24500
},
{
"epoch": 4.98603909054647,
"grad_norm": 4.36814022064209,
"learning_rate": 1.515755883526127e-07,
"loss": 6.3996,
"step": 25000
},
{
"epoch": 5.0,
"step": 25070,
"total_flos": 2.639861525017728e+16,
"train_loss": 5.285465436767286,
"train_runtime": 6500.188,
"train_samples_per_second": 61.705,
"train_steps_per_second": 3.857
},
{
"epoch": 5.085759872357399,
"grad_norm": 3.5418105125427246,
"learning_rate": 4.946400079776626e-05,
"loss": 6.5458,
"step": 25500
},
{
"epoch": 5.1854806541683285,
"grad_norm": 4.323005676269531,
"learning_rate": 4.884074591144795e-05,
"loss": 6.5604,
"step": 26000
},
{
"epoch": 5.285201435979258,
"grad_norm": 4.445618629455566,
"learning_rate": 4.8217491025129644e-05,
"loss": 6.5452,
"step": 26500
},
{
"epoch": 5.384922217790187,
"grad_norm": 4.320890426635742,
"learning_rate": 4.759423613881133e-05,
"loss": 6.5239,
"step": 27000
},
{
"epoch": 5.484642999601117,
"grad_norm": 3.8980209827423096,
"learning_rate": 4.697098125249302e-05,
"loss": 6.5278,
"step": 27500
},
{
"epoch": 5.584363781412046,
"grad_norm": 4.074916362762451,
"learning_rate": 4.6347726366174716e-05,
"loss": 6.5044,
"step": 28000
},
{
"epoch": 5.684084563222975,
"grad_norm": 4.465285778045654,
"learning_rate": 4.572447147985641e-05,
"loss": 6.472,
"step": 28500
},
{
"epoch": 5.783805345033905,
"grad_norm": 4.351347923278809,
"learning_rate": 4.5101216593538095e-05,
"loss": 6.4504,
"step": 29000
},
{
"epoch": 5.883526126844835,
"grad_norm": 4.14565372467041,
"learning_rate": 4.447796170721978e-05,
"loss": 6.4375,
"step": 29500
},
{
"epoch": 5.983246908655763,
"grad_norm": 4.669959545135498,
"learning_rate": 4.3854706820901474e-05,
"loss": 6.4393,
"step": 30000
},
{
"epoch": 6.082967690466694,
"grad_norm": 4.345717430114746,
"learning_rate": 4.323145193458317e-05,
"loss": 6.3808,
"step": 30500
},
{
"epoch": 6.182688472277623,
"grad_norm": 4.040054798126221,
"learning_rate": 4.260819704826486e-05,
"loss": 6.3705,
"step": 31000
},
{
"epoch": 6.282409254088552,
"grad_norm": 4.663171291351318,
"learning_rate": 4.198618867171919e-05,
"loss": 6.3803,
"step": 31500
},
{
"epoch": 6.382130035899482,
"grad_norm": 4.45890474319458,
"learning_rate": 4.136293378540088e-05,
"loss": 6.3256,
"step": 32000
},
{
"epoch": 6.481850817710411,
"grad_norm": 4.158110618591309,
"learning_rate": 4.073967889908257e-05,
"loss": 6.3351,
"step": 32500
},
{
"epoch": 6.58157159952134,
"grad_norm": 4.460795879364014,
"learning_rate": 4.0116424012764265e-05,
"loss": 6.3137,
"step": 33000
},
{
"epoch": 6.68129238133227,
"grad_norm": 4.767895221710205,
"learning_rate": 3.949316912644596e-05,
"loss": 6.2751,
"step": 33500
},
{
"epoch": 6.781013163143199,
"grad_norm": 4.399994850158691,
"learning_rate": 3.887116074990028e-05,
"loss": 6.2345,
"step": 34000
},
{
"epoch": 6.8807339449541285,
"grad_norm": 4.522914886474609,
"learning_rate": 3.8247905863581976e-05,
"loss": 6.218,
"step": 34500
},
{
"epoch": 6.980454726765058,
"grad_norm": 4.697731018066406,
"learning_rate": 3.762465097726366e-05,
"loss": 6.1819,
"step": 35000
},
{
"epoch": 7.080175508575987,
"grad_norm": 5.113608360290527,
"learning_rate": 3.7001396090945355e-05,
"loss": 6.1566,
"step": 35500
},
{
"epoch": 7.179896290386917,
"grad_norm": 4.987142086029053,
"learning_rate": 3.637814120462705e-05,
"loss": 6.1504,
"step": 36000
},
{
"epoch": 7.279617072197846,
"grad_norm": 4.797494888305664,
"learning_rate": 3.5756132828081373e-05,
"loss": 6.0915,
"step": 36500
},
{
"epoch": 7.379337854008775,
"grad_norm": 5.114543437957764,
"learning_rate": 3.5132877941763066e-05,
"loss": 6.0859,
"step": 37000
},
{
"epoch": 7.479058635819705,
"grad_norm": 5.5212721824646,
"learning_rate": 3.450962305544476e-05,
"loss": 6.0643,
"step": 37500
},
{
"epoch": 7.578779417630634,
"grad_norm": 4.77981424331665,
"learning_rate": 3.3886368169126446e-05,
"loss": 6.038,
"step": 38000
},
{
"epoch": 7.678500199441563,
"grad_norm": 5.6912760734558105,
"learning_rate": 3.326311328280814e-05,
"loss": 6.0327,
"step": 38500
},
{
"epoch": 7.778220981252493,
"grad_norm": 5.021594524383545,
"learning_rate": 3.2641104906262464e-05,
"loss": 6.0089,
"step": 39000
},
{
"epoch": 7.877941763063422,
"grad_norm": 4.9512410163879395,
"learning_rate": 3.201785001994416e-05,
"loss": 5.9914,
"step": 39500
},
{
"epoch": 7.9776625448743514,
"grad_norm": 4.6659088134765625,
"learning_rate": 3.139459513362585e-05,
"loss": 5.9688,
"step": 40000
},
{
"epoch": 8.07738332668528,
"grad_norm": 5.084179401397705,
"learning_rate": 3.601552017986003e-05,
"loss": 5.9368,
"step": 40500
},
{
"epoch": 8.177104108496211,
"grad_norm": 5.475657939910889,
"learning_rate": 3.556224389890126e-05,
"loss": 5.9181,
"step": 41000
},
{
"epoch": 8.27682489030714,
"grad_norm": 4.678411960601807,
"learning_rate": 3.510896761794249e-05,
"loss": 5.8795,
"step": 41500
},
{
"epoch": 8.37654567211807,
"grad_norm": 5.502169132232666,
"learning_rate": 3.465569133698372e-05,
"loss": 5.8389,
"step": 42000
},
{
"epoch": 8.476266453928998,
"grad_norm": 5.32131290435791,
"learning_rate": 3.420241505602495e-05,
"loss": 5.8329,
"step": 42500
},
{
"epoch": 8.575987235739928,
"grad_norm": 5.6808552742004395,
"learning_rate": 3.374913877506618e-05,
"loss": 5.8001,
"step": 43000
},
{
"epoch": 8.675708017550857,
"grad_norm": 4.988351821899414,
"learning_rate": 3.329586249410741e-05,
"loss": 5.7928,
"step": 43500
},
{
"epoch": 8.775428799361787,
"grad_norm": 5.559896469116211,
"learning_rate": 3.284258621314864e-05,
"loss": 5.7488,
"step": 44000
},
{
"epoch": 8.875149581172716,
"grad_norm": 6.084516525268555,
"learning_rate": 3.238930993218987e-05,
"loss": 5.7262,
"step": 44500
},
{
"epoch": 8.974870362983646,
"grad_norm": 6.219081401824951,
"learning_rate": 3.19360336512311e-05,
"loss": 5.6925,
"step": 45000
},
{
"epoch": 9.074591144794574,
"grad_norm": 6.170139789581299,
"learning_rate": 3.1482757370272333e-05,
"loss": 5.6491,
"step": 45500
},
{
"epoch": 9.174311926605505,
"grad_norm": 5.830073356628418,
"learning_rate": 3.102948108931356e-05,
"loss": 5.6228,
"step": 46000
},
{
"epoch": 9.274032708416435,
"grad_norm": 5.452333927154541,
"learning_rate": 3.0577111360916706e-05,
"loss": 5.5724,
"step": 46500
},
{
"epoch": 9.373753490227363,
"grad_norm": 5.113864421844482,
"learning_rate": 3.0123835079957935e-05,
"loss": 5.5437,
"step": 47000
},
{
"epoch": 9.473474272038294,
"grad_norm": 5.875530242919922,
"learning_rate": 2.9670558798999164e-05,
"loss": 5.525,
"step": 47500
},
{
"epoch": 9.573195053849222,
"grad_norm": 5.342255592346191,
"learning_rate": 2.9217282518040397e-05,
"loss": 5.5145,
"step": 48000
},
{
"epoch": 9.672915835660152,
"grad_norm": 6.1103644371032715,
"learning_rate": 2.8764006237081626e-05,
"loss": 5.4687,
"step": 48500
},
{
"epoch": 9.77263661747108,
"grad_norm": 6.640170097351074,
"learning_rate": 2.8310729956122855e-05,
"loss": 5.4448,
"step": 49000
},
{
"epoch": 9.872357399282011,
"grad_norm": 6.135842323303223,
"learning_rate": 2.7858360227726005e-05,
"loss": 5.4075,
"step": 49500
},
{
"epoch": 9.97207818109294,
"grad_norm": 6.063602924346924,
"learning_rate": 2.7405083946767234e-05,
"loss": 5.374,
"step": 50000
},
{
"epoch": 10.07179896290387,
"grad_norm": 6.689053535461426,
"learning_rate": 2.6951807665808463e-05,
"loss": 5.3459,
"step": 50500
},
{
"epoch": 10.171519744714798,
"grad_norm": 6.488341331481934,
"learning_rate": 2.6498531384849696e-05,
"loss": 5.3185,
"step": 51000
},
{
"epoch": 10.271240526525728,
"grad_norm": 6.589330673217773,
"learning_rate": 2.6045255103890925e-05,
"loss": 5.3019,
"step": 51500
},
{
"epoch": 10.370961308336657,
"grad_norm": 6.61977481842041,
"learning_rate": 2.5592885375494075e-05,
"loss": 5.2792,
"step": 52000
},
{
"epoch": 10.470682090147587,
"grad_norm": 6.396610736846924,
"learning_rate": 2.5139609094535304e-05,
"loss": 5.2347,
"step": 52500
},
{
"epoch": 10.570402871958516,
"grad_norm": 7.000791549682617,
"learning_rate": 2.4686332813576534e-05,
"loss": 5.2252,
"step": 53000
},
{
"epoch": 10.670123653769446,
"grad_norm": 6.714987277984619,
"learning_rate": 2.4233056532617763e-05,
"loss": 5.1965,
"step": 53500
},
{
"epoch": 10.769844435580374,
"grad_norm": 7.012180805206299,
"learning_rate": 2.3779780251658992e-05,
"loss": 5.1769,
"step": 54000
},
{
"epoch": 10.869565217391305,
"grad_norm": 6.85835599899292,
"learning_rate": 2.332650397070022e-05,
"loss": 5.1442,
"step": 54500
},
{
"epoch": 10.969285999202233,
"grad_norm": 6.789878845214844,
"learning_rate": 2.2873227689741453e-05,
"loss": 5.1071,
"step": 55000
},
{
"epoch": 11.0,
"step": 55154,
"total_flos": 5.807695355039002e+16,
"train_loss": 1.5156397336923944,
"train_runtime": 4860.501,
"train_samples_per_second": 181.547,
"train_steps_per_second": 11.347
},
{
"epoch": 11.069006781013163,
"grad_norm": 7.099039077758789,
"learning_rate": 4.976997739662279e-05,
"loss": 5.2813,
"step": 55500
},
{
"epoch": 11.168727562824092,
"grad_norm": 6.935009479522705,
"learning_rate": 4.943757479058636e-05,
"loss": 5.2781,
"step": 56000
},
{
"epoch": 11.268448344635022,
"grad_norm": 8.239794731140137,
"learning_rate": 4.910517218454993e-05,
"loss": 5.2531,
"step": 56500
},
{
"epoch": 11.36816912644595,
"grad_norm": 6.757853031158447,
"learning_rate": 4.87727695785135e-05,
"loss": 5.1861,
"step": 57000
},
{
"epoch": 11.46788990825688,
"grad_norm": 7.666926383972168,
"learning_rate": 4.844036697247707e-05,
"loss": 5.1783,
"step": 57500
},
{
"epoch": 11.56761069006781,
"grad_norm": 7.166041374206543,
"learning_rate": 4.810796436644063e-05,
"loss": 5.1202,
"step": 58000
},
{
"epoch": 11.66733147187874,
"grad_norm": 7.543915748596191,
"learning_rate": 4.77755617604042e-05,
"loss": 5.0482,
"step": 58500
},
{
"epoch": 11.76705225368967,
"grad_norm": 8.00036907196045,
"learning_rate": 4.744315915436777e-05,
"loss": 5.0167,
"step": 59000
},
{
"epoch": 11.866773035500598,
"grad_norm": 6.7936272621154785,
"learning_rate": 4.711075654833134e-05,
"loss": 4.9823,
"step": 59500
},
{
"epoch": 11.966493817311529,
"grad_norm": 7.003523826599121,
"learning_rate": 4.677835394229491e-05,
"loss": 4.9457,
"step": 60000
},
{
"epoch": 12.066214599122457,
"grad_norm": 7.01780891418457,
"learning_rate": 4.644595133625848e-05,
"loss": 4.825,
"step": 60500
},
{
"epoch": 12.165935380933387,
"grad_norm": 7.654853820800781,
"learning_rate": 4.6113548730222045e-05,
"loss": 4.7741,
"step": 61000
},
{
"epoch": 12.265656162744316,
"grad_norm": 7.968235492706299,
"learning_rate": 4.578181092939769e-05,
"loss": 4.7404,
"step": 61500
},
{
"epoch": 12.365376944555246,
"grad_norm": 7.112838268280029,
"learning_rate": 4.544940832336126e-05,
"loss": 4.6502,
"step": 62000
},
{
"epoch": 12.465097726366174,
"grad_norm": 6.567187786102295,
"learning_rate": 4.511700571732483e-05,
"loss": 4.6277,
"step": 62500
},
{
"epoch": 12.564818508177105,
"grad_norm": 6.989046096801758,
"learning_rate": 4.478460311128839e-05,
"loss": 4.5757,
"step": 63000
},
{
"epoch": 12.664539289988033,
"grad_norm": 6.270955562591553,
"learning_rate": 4.445220050525196e-05,
"loss": 4.5394,
"step": 63500
},
{
"epoch": 12.764260071798963,
"grad_norm": 6.227508544921875,
"learning_rate": 4.412046270442761e-05,
"loss": 4.4651,
"step": 64000
},
{
"epoch": 12.863980853609892,
"grad_norm": 6.464995861053467,
"learning_rate": 4.378806009839118e-05,
"loss": 4.423,
"step": 64500
},
{
"epoch": 12.963701635420822,
"grad_norm": 6.102914810180664,
"learning_rate": 4.345565749235474e-05,
"loss": 4.3969,
"step": 65000
},
{
"epoch": 13.06342241723175,
"grad_norm": 6.3487067222595215,
"learning_rate": 4.312325488631831e-05,
"loss": 4.2689,
"step": 65500
},
{
"epoch": 13.16314319904268,
"grad_norm": 6.235875129699707,
"learning_rate": 4.279085228028188e-05,
"loss": 4.2232,
"step": 66000
},
{
"epoch": 13.26286398085361,
"grad_norm": 5.931600570678711,
"learning_rate": 4.245844967424545e-05,
"loss": 4.222,
"step": 66500
},
{
"epoch": 13.36258476266454,
"grad_norm": 5.873235702514648,
"learning_rate": 4.212604706820902e-05,
"loss": 4.1722,
"step": 67000
},
{
"epoch": 13.462305544475468,
"grad_norm": 6.30717134475708,
"learning_rate": 4.179364446217259e-05,
"loss": 4.1255,
"step": 67500
},
{
"epoch": 13.562026326286398,
"grad_norm": 5.893185138702393,
"learning_rate": 4.146190666134823e-05,
"loss": 4.0975,
"step": 68000
},
{
"epoch": 13.661747108097327,
"grad_norm": 6.775746822357178,
"learning_rate": 4.113016886052387e-05,
"loss": 4.0787,
"step": 68500
},
{
"epoch": 13.761467889908257,
"grad_norm": 5.948095798492432,
"learning_rate": 4.0797766254487435e-05,
"loss": 4.0581,
"step": 69000
},
{
"epoch": 13.861188671719185,
"grad_norm": 5.961909770965576,
"learning_rate": 4.0465363648451005e-05,
"loss": 4.0097,
"step": 69500
},
{
"epoch": 13.960909453530116,
"grad_norm": 5.72122859954834,
"learning_rate": 4.0132961042414575e-05,
"loss": 3.9751,
"step": 70000
},
{
"epoch": 14.060630235341046,
"grad_norm": 6.1757378578186035,
"learning_rate": 3.980122324159022e-05,
"loss": 3.9707,
"step": 70500
},
{
"epoch": 14.160351017151974,
"grad_norm": 5.7611236572265625,
"learning_rate": 3.946882063555378e-05,
"loss": 3.9126,
"step": 71000
},
{
"epoch": 14.260071798962905,
"grad_norm": 6.233034133911133,
"learning_rate": 3.913641802951735e-05,
"loss": 3.9005,
"step": 71500
},
{
"epoch": 14.359792580773833,
"grad_norm": 6.282217979431152,
"learning_rate": 3.880401542348092e-05,
"loss": 3.8648,
"step": 72000
},
{
"epoch": 14.459513362584763,
"grad_norm": 6.495648384094238,
"learning_rate": 3.847161281744449e-05,
"loss": 3.8567,
"step": 72500
},
{
"epoch": 14.559234144395692,
"grad_norm": 6.3030195236206055,
"learning_rate": 3.813921021140806e-05,
"loss": 3.839,
"step": 73000
},
{
"epoch": 14.658954926206622,
"grad_norm": 5.807531833648682,
"learning_rate": 3.78074724105837e-05,
"loss": 3.8156,
"step": 73500
},
{
"epoch": 14.75867570801755,
"grad_norm": 5.283077716827393,
"learning_rate": 3.747506980454727e-05,
"loss": 3.8142,
"step": 74000
},
{
"epoch": 14.85839648982848,
"grad_norm": 5.933303356170654,
"learning_rate": 3.714266719851084e-05,
"loss": 3.8109,
"step": 74500
},
{
"epoch": 14.95811727163941,
"grad_norm": 6.217842102050781,
"learning_rate": 3.681026459247441e-05,
"loss": 3.7937,
"step": 75000
},
{
"epoch": 15.0,
"step": 75210,
"total_flos": 7.919584575053184e+16,
"train_loss": 1.1771604976443562,
"train_runtime": 4937.567,
"train_samples_per_second": 243.7,
"train_steps_per_second": 15.232
},
{
"epoch": 15.05783805345034,
"grad_norm": 6.581785202026367,
"learning_rate": 4.985540486637415e-05,
"loss": 3.8491,
"step": 75500
},
{
"epoch": 15.157558835261268,
"grad_norm": 6.372396469116211,
"learning_rate": 4.960610291184683e-05,
"loss": 3.8838,
"step": 76000
},
{
"epoch": 15.257279617072198,
"grad_norm": 6.738864421844482,
"learning_rate": 4.935680095731951e-05,
"loss": 3.8834,
"step": 76500
},
{
"epoch": 15.357000398883127,
"grad_norm": 6.700061798095703,
"learning_rate": 4.910749900279219e-05,
"loss": 3.8559,
"step": 77000
},
{
"epoch": 15.456721180694057,
"grad_norm": 6.3839497566223145,
"learning_rate": 4.8858197048264857e-05,
"loss": 3.8275,
"step": 77500
},
{
"epoch": 15.556441962504985,
"grad_norm": 6.165511131286621,
"learning_rate": 4.860889509373754e-05,
"loss": 3.803,
"step": 78000
},
{
"epoch": 15.656162744315916,
"grad_norm": 5.800929069519043,
"learning_rate": 4.8359593139210215e-05,
"loss": 3.8,
"step": 78500
},
{
"epoch": 15.755883526126844,
"grad_norm": 6.714051246643066,
"learning_rate": 4.811029118468289e-05,
"loss": 3.797,
"step": 79000
},
{
"epoch": 15.855604307937774,
"grad_norm": 6.74777889251709,
"learning_rate": 4.786098923015557e-05,
"loss": 3.7759,
"step": 79500
},
{
"epoch": 15.955325089748703,
"grad_norm": 6.980929374694824,
"learning_rate": 4.761168727562824e-05,
"loss": 3.7445,
"step": 80000
},
{
"epoch": 16.05504587155963,
"grad_norm": 6.54088020324707,
"learning_rate": 4.736238532110092e-05,
"loss": 3.6805,
"step": 80500
},
{
"epoch": 16.15476665337056,
"grad_norm": 5.999478340148926,
"learning_rate": 4.7113083366573594e-05,
"loss": 3.6537,
"step": 81000
},
{
"epoch": 16.254487435181492,
"grad_norm": 6.384885311126709,
"learning_rate": 4.686378141204627e-05,
"loss": 3.6522,
"step": 81500
},
{
"epoch": 16.354208216992422,
"grad_norm": 6.624803066253662,
"learning_rate": 4.661447945751895e-05,
"loss": 3.6302,
"step": 82000
},
{
"epoch": 16.453928998803352,
"grad_norm": 6.454346656799316,
"learning_rate": 4.636567610690068e-05,
"loss": 3.6179,
"step": 82500
},
{
"epoch": 16.55364978061428,
"grad_norm": 6.266842365264893,
"learning_rate": 4.611637415237336e-05,
"loss": 3.6265,
"step": 83000
},
{
"epoch": 16.65337056242521,
"grad_norm": 6.608065128326416,
"learning_rate": 4.5867072197846036e-05,
"loss": 3.6105,
"step": 83500
},
{
"epoch": 16.75309134423614,
"grad_norm": 6.4489426612854,
"learning_rate": 4.5617770243318705e-05,
"loss": 3.5994,
"step": 84000
},
{
"epoch": 16.85281212604707,
"grad_norm": 6.433938503265381,
"learning_rate": 4.536896689270044e-05,
"loss": 3.5648,
"step": 84500
},
{
"epoch": 16.952532907857996,
"grad_norm": 7.4558610916137695,
"learning_rate": 4.511966493817312e-05,
"loss": 3.5746,
"step": 85000
},
{
"epoch": 17.052253689668927,
"grad_norm": 5.742049217224121,
"learning_rate": 4.4870362983645795e-05,
"loss": 3.5378,
"step": 85500
},
{
"epoch": 17.151974471479857,
"grad_norm": 6.346868515014648,
"learning_rate": 4.462106102911847e-05,
"loss": 3.505,
"step": 86000
},
{
"epoch": 17.251695253290787,
"grad_norm": 6.252668857574463,
"learning_rate": 4.4371759074591147e-05,
"loss": 3.4787,
"step": 86500
},
{
"epoch": 17.351416035101714,
"grad_norm": 6.237195014953613,
"learning_rate": 4.412245712006383e-05,
"loss": 3.4914,
"step": 87000
},
{
"epoch": 17.451136816912644,
"grad_norm": 7.106077194213867,
"learning_rate": 4.3873653769445554e-05,
"loss": 3.4641,
"step": 87500
},
{
"epoch": 17.550857598723574,
"grad_norm": 7.160710334777832,
"learning_rate": 4.362435181491823e-05,
"loss": 3.4419,
"step": 88000
},
{
"epoch": 17.650578380534505,
"grad_norm": 7.160135746002197,
"learning_rate": 4.337504986039091e-05,
"loss": 3.4604,
"step": 88500
},
{
"epoch": 17.75029916234543,
"grad_norm": 6.785101890563965,
"learning_rate": 4.312574790586358e-05,
"loss": 3.432,
"step": 89000
},
{
"epoch": 17.85001994415636,
"grad_norm": 5.990314960479736,
"learning_rate": 4.287644595133626e-05,
"loss": 3.4045,
"step": 89500
},
{
"epoch": 17.949740725967292,
"grad_norm": 6.434844493865967,
"learning_rate": 4.2627642600717995e-05,
"loss": 3.4236,
"step": 90000
},
{
"epoch": 18.049461507778222,
"grad_norm": 6.7937774658203125,
"learning_rate": 4.2378340646190664e-05,
"loss": 3.3902,
"step": 90500
},
{
"epoch": 18.14918228958915,
"grad_norm": 7.1783576011657715,
"learning_rate": 4.212903869166335e-05,
"loss": 3.3545,
"step": 91000
},
{
"epoch": 18.24890307140008,
"grad_norm": 6.374876022338867,
"learning_rate": 4.187973673713602e-05,
"loss": 3.3451,
"step": 91500
},
{
"epoch": 18.34862385321101,
"grad_norm": 6.49647331237793,
"learning_rate": 4.163093338651775e-05,
"loss": 3.3452,
"step": 92000
},
{
"epoch": 18.44834463502194,
"grad_norm": 6.785512924194336,
"learning_rate": 4.138163143199043e-05,
"loss": 3.3102,
"step": 92500
},
{
"epoch": 18.54806541683287,
"grad_norm": 6.842392921447754,
"learning_rate": 4.1132329477463106e-05,
"loss": 3.3376,
"step": 93000
},
{
"epoch": 18.647786198643796,
"grad_norm": 7.126637935638428,
"learning_rate": 4.088302752293578e-05,
"loss": 3.3249,
"step": 93500
},
{
"epoch": 18.747506980454727,
"grad_norm": 5.808903217315674,
"learning_rate": 4.063372556840846e-05,
"loss": 3.2808,
"step": 94000
},
{
"epoch": 18.847227762265657,
"grad_norm": 6.2346954345703125,
"learning_rate": 4.0385420821699245e-05,
"loss": 3.3189,
"step": 94500
},
{
"epoch": 18.946948544076587,
"grad_norm": 6.60822057723999,
"learning_rate": 4.013611886717192e-05,
"loss": 3.3143,
"step": 95000
},
{
"epoch": 19.046669325887514,
"grad_norm": 6.471176624298096,
"learning_rate": 3.9886816912644597e-05,
"loss": 3.2855,
"step": 95500
},
{
"epoch": 19.146390107698444,
"grad_norm": 6.365059852600098,
"learning_rate": 3.963751495811727e-05,
"loss": 3.2616,
"step": 96000
},
{
"epoch": 19.246110889509374,
"grad_norm": 6.250296592712402,
"learning_rate": 3.9388213003589955e-05,
"loss": 3.226,
"step": 96500
},
{
"epoch": 19.345831671320305,
"grad_norm": 6.003506660461426,
"learning_rate": 3.9138911049062624e-05,
"loss": 3.2352,
"step": 97000
},
{
"epoch": 19.44555245313123,
"grad_norm": 5.75541353225708,
"learning_rate": 3.88896090945353e-05,
"loss": 3.2395,
"step": 97500
},
{
"epoch": 19.54527323494216,
"grad_norm": 6.684996604919434,
"learning_rate": 3.864030714000798e-05,
"loss": 3.2272,
"step": 98000
},
{
"epoch": 19.644994016753092,
"grad_norm": 5.906820297241211,
"learning_rate": 3.839100518548066e-05,
"loss": 3.2096,
"step": 98500
},
{
"epoch": 19.744714798564022,
"grad_norm": 6.240872383117676,
"learning_rate": 3.814220183486238e-05,
"loss": 3.2016,
"step": 99000
},
{
"epoch": 19.84443558037495,
"grad_norm": 6.751197338104248,
"learning_rate": 3.7892899880335066e-05,
"loss": 3.2141,
"step": 99500
},
{
"epoch": 19.94415636218588,
"grad_norm": 6.535121917724609,
"learning_rate": 3.764359792580774e-05,
"loss": 3.1829,
"step": 100000
},
{
"epoch": 20.0,
"step": 100280,
"total_flos": 1.0559446100070912e+17,
"train_loss": 0.8717835233465437,
"train_runtime": 7516.252,
"train_samples_per_second": 213.455,
"train_steps_per_second": 13.342
},
{
"epoch": 20.04387714399681,
"grad_norm": 6.778732776641846,
"learning_rate": 4.9912245712006384e-05,
"loss": 3.2388,
"step": 100500
},
{
"epoch": 20.14359792580774,
"grad_norm": 6.354984760284424,
"learning_rate": 4.9712804148384526e-05,
"loss": 3.2858,
"step": 101000
},
{
"epoch": 20.243318707618666,
"grad_norm": 7.301539897918701,
"learning_rate": 4.951336258476267e-05,
"loss": 3.3016,
"step": 101500
},
{
"epoch": 20.343039489429597,
"grad_norm": 7.8318772315979,
"learning_rate": 4.931392102114081e-05,
"loss": 3.2969,
"step": 102000
},
{
"epoch": 20.442760271240527,
"grad_norm": 6.826496124267578,
"learning_rate": 4.9114479457518946e-05,
"loss": 3.315,
"step": 102500
},
{
"epoch": 20.542481053051457,
"grad_norm": 6.47593879699707,
"learning_rate": 4.8915037893897094e-05,
"loss": 3.2395,
"step": 103000
},
{
"epoch": 20.642201834862384,
"grad_norm": 6.942465782165527,
"learning_rate": 4.871559633027523e-05,
"loss": 3.2812,
"step": 103500
},
{
"epoch": 20.741922616673314,
"grad_norm": 6.694247722625732,
"learning_rate": 4.851615476665337e-05,
"loss": 3.2757,
"step": 104000
},
{
"epoch": 20.841643398484244,
"grad_norm": 6.374402046203613,
"learning_rate": 4.8316713203031514e-05,
"loss": 3.2517,
"step": 104500
},
{
"epoch": 20.941364180295174,
"grad_norm": 7.804276943206787,
"learning_rate": 4.8117271639409656e-05,
"loss": 3.2417,
"step": 105000
},
{
"epoch": 21.041084962106105,
"grad_norm": 7.735393524169922,
"learning_rate": 4.791822895891504e-05,
"loss": 3.2124,
"step": 105500
},
{
"epoch": 21.14080574391703,
"grad_norm": 6.500980377197266,
"learning_rate": 4.771878739529318e-05,
"loss": 3.1786,
"step": 106000
},
{
"epoch": 21.24052652572796,
"grad_norm": 6.206303119659424,
"learning_rate": 4.751934583167133e-05,
"loss": 3.188,
"step": 106500
},
{
"epoch": 21.340247307538892,
"grad_norm": 7.221670150756836,
"learning_rate": 4.731990426804946e-05,
"loss": 3.1658,
"step": 107000
},
{
"epoch": 21.439968089349822,
"grad_norm": 6.705102443695068,
"learning_rate": 4.7120462704427605e-05,
"loss": 3.1698,
"step": 107500
},
{
"epoch": 21.53968887116075,
"grad_norm": 7.459311485290527,
"learning_rate": 4.692102114080575e-05,
"loss": 3.1263,
"step": 108000
},
{
"epoch": 21.63940965297168,
"grad_norm": 6.276129245758057,
"learning_rate": 4.6721978460311135e-05,
"loss": 3.1438,
"step": 108500
},
{
"epoch": 21.73913043478261,
"grad_norm": 6.849742412567139,
"learning_rate": 4.652253689668927e-05,
"loss": 3.1688,
"step": 109000
},
{
"epoch": 21.83885121659354,
"grad_norm": 6.463535308837891,
"learning_rate": 4.632309533306741e-05,
"loss": 3.1261,
"step": 109500
},
{
"epoch": 21.938571998404466,
"grad_norm": 6.4734063148498535,
"learning_rate": 4.6123653769445554e-05,
"loss": 3.1375,
"step": 110000
},
{
"epoch": 22.038292780215397,
"grad_norm": 6.659780025482178,
"learning_rate": 4.5924212205823696e-05,
"loss": 3.1488,
"step": 110500
},
{
"epoch": 22.138013562026327,
"grad_norm": 6.0405402183532715,
"learning_rate": 4.572477064220184e-05,
"loss": 3.0816,
"step": 111000
},
{
"epoch": 22.237734343837257,
"grad_norm": 6.467530727386475,
"learning_rate": 4.5525329078579974e-05,
"loss": 3.0573,
"step": 111500
},
{
"epoch": 22.337455125648184,
"grad_norm": 7.352579116821289,
"learning_rate": 4.532588751495812e-05,
"loss": 3.054,
"step": 112000
},
{
"epoch": 22.437175907459114,
"grad_norm": 6.598001956939697,
"learning_rate": 4.5126844834463503e-05,
"loss": 3.0912,
"step": 112500
},
{
"epoch": 22.536896689270044,
"grad_norm": 7.065674304962158,
"learning_rate": 4.492780215396889e-05,
"loss": 3.0784,
"step": 113000
},
{
"epoch": 22.636617471080974,
"grad_norm": 6.545448303222656,
"learning_rate": 4.472836059034703e-05,
"loss": 3.0489,
"step": 113500
},
{
"epoch": 22.7363382528919,
"grad_norm": 6.2428059577941895,
"learning_rate": 4.4528919026725175e-05,
"loss": 3.0412,
"step": 114000
},
{
"epoch": 22.83605903470283,
"grad_norm": 6.4470367431640625,
"learning_rate": 4.432947746310331e-05,
"loss": 3.0359,
"step": 114500
},
{
"epoch": 22.93577981651376,
"grad_norm": 6.093207836151123,
"learning_rate": 4.413003589948145e-05,
"loss": 3.0304,
"step": 115000
},
{
"epoch": 23.035500598324692,
"grad_norm": 6.75270414352417,
"learning_rate": 4.3930594335859595e-05,
"loss": 3.02,
"step": 115500
},
{
"epoch": 23.13522138013562,
"grad_norm": 6.7165374755859375,
"learning_rate": 4.373115277223774e-05,
"loss": 3.0069,
"step": 116000
},
{
"epoch": 23.23494216194655,
"grad_norm": 5.961038589477539,
"learning_rate": 4.353171120861588e-05,
"loss": 2.9626,
"step": 116500
},
{
"epoch": 23.33466294375748,
"grad_norm": 6.657290935516357,
"learning_rate": 4.333266852812126e-05,
"loss": 2.9839,
"step": 117000
},
{
"epoch": 23.43438372556841,
"grad_norm": 6.603748798370361,
"learning_rate": 4.31332269644994e-05,
"loss": 2.984,
"step": 117500
},
{
"epoch": 23.53410450737934,
"grad_norm": 6.49187707901001,
"learning_rate": 4.2933785400877544e-05,
"loss": 2.9815,
"step": 118000
},
{
"epoch": 23.633825289190266,
"grad_norm": 7.08600378036499,
"learning_rate": 4.273474272038293e-05,
"loss": 2.9877,
"step": 118500
},
{
"epoch": 23.733546071001197,
"grad_norm": 6.5724077224731445,
"learning_rate": 4.253530115676107e-05,
"loss": 2.965,
"step": 119000
},
{
"epoch": 23.833266852812127,
"grad_norm": 6.058481693267822,
"learning_rate": 4.233585959313921e-05,
"loss": 2.9759,
"step": 119500
},
{
"epoch": 23.932987634623057,
"grad_norm": 7.042490482330322,
"learning_rate": 4.213641802951736e-05,
"loss": 2.9619,
"step": 120000
},
{
"epoch": 24.032708416433984,
"grad_norm": 6.764120578765869,
"learning_rate": 4.193697646589549e-05,
"loss": 2.9482,
"step": 120500
},
{
"epoch": 24.132429198244914,
"grad_norm": 6.224752426147461,
"learning_rate": 4.1737534902273635e-05,
"loss": 2.9368,
"step": 121000
},
{
"epoch": 24.232149980055844,
"grad_norm": 6.817770481109619,
"learning_rate": 4.153809333865178e-05,
"loss": 2.9325,
"step": 121500
},
{
"epoch": 24.331870761866774,
"grad_norm": 6.26372766494751,
"learning_rate": 4.133865177502992e-05,
"loss": 2.8953,
"step": 122000
},
{
"epoch": 24.4315915436777,
"grad_norm": 7.136674880981445,
"learning_rate": 4.11396090945353e-05,
"loss": 2.9019,
"step": 122500
},
{
"epoch": 24.53131232548863,
"grad_norm": 6.46077299118042,
"learning_rate": 4.094016753091344e-05,
"loss": 2.9091,
"step": 123000
},
{
"epoch": 24.63103310729956,
"grad_norm": 6.0465288162231445,
"learning_rate": 4.0740725967291584e-05,
"loss": 2.9107,
"step": 123500
},
{
"epoch": 24.730753889110492,
"grad_norm": 6.354468822479248,
"learning_rate": 4.0541284403669726e-05,
"loss": 2.9206,
"step": 124000
},
{
"epoch": 24.83047467092142,
"grad_norm": 6.679784297943115,
"learning_rate": 4.0342241723175114e-05,
"loss": 2.8901,
"step": 124500
},
{
"epoch": 24.93019545273235,
"grad_norm": 6.418820858001709,
"learning_rate": 4.014280015955325e-05,
"loss": 2.8971,
"step": 125000
},
{
"epoch": 25.0,
"step": 125350,
"total_flos": 1.319930762508864e+17,
"train_loss": 0.6150265672133651,
"train_runtime": 7682.6614,
"train_samples_per_second": 261.039,
"train_steps_per_second": 16.316
},
{
"epoch": 25.02991623454328,
"grad_norm": 6.8266754150390625,
"learning_rate": 4.9950139609094536e-05,
"loss": 2.9041,
"step": 125500
},
{
"epoch": 25.12963701635421,
"grad_norm": 7.047895431518555,
"learning_rate": 4.9783938306076325e-05,
"loss": 2.9501,
"step": 126000
},
{
"epoch": 25.229357798165136,
"grad_norm": 6.489243507385254,
"learning_rate": 4.9617737003058106e-05,
"loss": 2.9795,
"step": 126500
},
{
"epoch": 25.329078579976066,
"grad_norm": 6.933114528656006,
"learning_rate": 4.9451535700039895e-05,
"loss": 2.9906,
"step": 127000
},
{
"epoch": 25.428799361786997,
"grad_norm": 7.721564769744873,
"learning_rate": 4.9285334397021676e-05,
"loss": 2.9822,
"step": 127500
},
{
"epoch": 25.528520143597927,
"grad_norm": 7.604334831237793,
"learning_rate": 4.911913309400346e-05,
"loss": 2.9751,
"step": 128000
},
{
"epoch": 25.628240925408853,
"grad_norm": 6.689730644226074,
"learning_rate": 4.8952931790985246e-05,
"loss": 2.9806,
"step": 128500
},
{
"epoch": 25.727961707219784,
"grad_norm": 7.001711368560791,
"learning_rate": 4.878673048796703e-05,
"loss": 2.9701,
"step": 129000
},
{
"epoch": 25.827682489030714,
"grad_norm": 6.627374649047852,
"learning_rate": 4.862052918494881e-05,
"loss": 2.982,
"step": 129500
},
{
"epoch": 25.927403270841644,
"grad_norm": 6.500030517578125,
"learning_rate": 4.8454660284536635e-05,
"loss": 2.9497,
"step": 130000
},
{
"epoch": 26.027124052652574,
"grad_norm": 6.908927917480469,
"learning_rate": 4.828845898151842e-05,
"loss": 2.9201,
"step": 130500
},
{
"epoch": 26.1268448344635,
"grad_norm": 7.953597068786621,
"learning_rate": 4.8122257678500205e-05,
"loss": 2.8916,
"step": 131000
},
{
"epoch": 26.22656561627443,
"grad_norm": 7.111712455749512,
"learning_rate": 4.795605637548199e-05,
"loss": 2.8983,
"step": 131500
},
{
"epoch": 26.32628639808536,
"grad_norm": 7.099549293518066,
"learning_rate": 4.778985507246377e-05,
"loss": 2.8862,
"step": 132000
},
{
"epoch": 26.426007179896292,
"grad_norm": 6.708031177520752,
"learning_rate": 4.762365376944555e-05,
"loss": 2.8828,
"step": 132500
},
{
"epoch": 26.52572796170722,
"grad_norm": 6.638050079345703,
"learning_rate": 4.745745246642734e-05,
"loss": 2.9,
"step": 133000
},
{
"epoch": 26.62544874351815,
"grad_norm": 6.474231243133545,
"learning_rate": 4.729125116340912e-05,
"loss": 2.8729,
"step": 133500
},
{
"epoch": 26.72516952532908,
"grad_norm": 7.071346759796143,
"learning_rate": 4.712538226299694e-05,
"loss": 2.878,
"step": 134000
},
{
"epoch": 26.82489030714001,
"grad_norm": 7.4629740715026855,
"learning_rate": 4.695918095997873e-05,
"loss": 2.8949,
"step": 134500
},
{
"epoch": 26.924611088950936,
"grad_norm": 7.166282653808594,
"learning_rate": 4.679331205956655e-05,
"loss": 2.8834,
"step": 135000
},
{
"epoch": 27.024331870761866,
"grad_norm": 7.213958263397217,
"learning_rate": 4.6627110756548334e-05,
"loss": 2.8722,
"step": 135500
},
{
"epoch": 27.124052652572797,
"grad_norm": 6.917830467224121,
"learning_rate": 4.6460909453530116e-05,
"loss": 2.812,
"step": 136000
},
{
"epoch": 27.223773434383727,
"grad_norm": 7.030029296875,
"learning_rate": 4.62947081505119e-05,
"loss": 2.7973,
"step": 136500
},
{
"epoch": 27.323494216194653,
"grad_norm": 6.927401542663574,
"learning_rate": 4.6128506847493686e-05,
"loss": 2.8567,
"step": 137000
},
{
"epoch": 27.423214998005584,
"grad_norm": 7.063901424407959,
"learning_rate": 4.596230554447547e-05,
"loss": 2.8119,
"step": 137500
},
{
"epoch": 27.522935779816514,
"grad_norm": 6.619449138641357,
"learning_rate": 4.5796104241457256e-05,
"loss": 2.814,
"step": 138000
},
{
"epoch": 27.622656561627444,
"grad_norm": 6.861698150634766,
"learning_rate": 4.562990293843904e-05,
"loss": 2.7966,
"step": 138500
},
{
"epoch": 27.72237734343837,
"grad_norm": 5.698707580566406,
"learning_rate": 4.5464034038026856e-05,
"loss": 2.8274,
"step": 139000
},
{
"epoch": 27.8220981252493,
"grad_norm": 6.638801574707031,
"learning_rate": 4.5297832735008645e-05,
"loss": 2.8111,
"step": 139500
},
{
"epoch": 27.92181890706023,
"grad_norm": 7.414352893829346,
"learning_rate": 4.5131631431990427e-05,
"loss": 2.8219,
"step": 140000
},
{
"epoch": 28.02153968887116,
"grad_norm": 7.000102519989014,
"learning_rate": 4.4965430128972215e-05,
"loss": 2.8059,
"step": 140500
},
{
"epoch": 28.121260470682092,
"grad_norm": 7.648940563201904,
"learning_rate": 4.4799561228560034e-05,
"loss": 2.7801,
"step": 141000
},
{
"epoch": 28.22098125249302,
"grad_norm": 6.238720417022705,
"learning_rate": 4.4633359925541815e-05,
"loss": 2.7611,
"step": 141500
},
{
"epoch": 28.32070203430395,
"grad_norm": 7.083422660827637,
"learning_rate": 4.4467491025129634e-05,
"loss": 2.7476,
"step": 142000
},
{
"epoch": 28.42042281611488,
"grad_norm": 7.1048760414123535,
"learning_rate": 4.430128972211142e-05,
"loss": 2.7601,
"step": 142500
},
{
"epoch": 28.52014359792581,
"grad_norm": 6.950742244720459,
"learning_rate": 4.4135088419093204e-05,
"loss": 2.7615,
"step": 143000
},
{
"epoch": 28.619864379736736,
"grad_norm": 7.063054084777832,
"learning_rate": 4.396888711607499e-05,
"loss": 2.7583,
"step": 143500
},
{
"epoch": 28.719585161547666,
"grad_norm": 6.951484680175781,
"learning_rate": 4.3802685813056774e-05,
"loss": 2.748,
"step": 144000
},
{
"epoch": 28.819305943358597,
"grad_norm": 7.212677955627441,
"learning_rate": 4.363648451003856e-05,
"loss": 2.7542,
"step": 144500
},
{
"epoch": 28.919026725169527,
"grad_norm": 6.691658973693848,
"learning_rate": 4.3470283207020344e-05,
"loss": 2.753,
"step": 145000
},
{
"epoch": 29.018747506980453,
"grad_norm": 7.1954874992370605,
"learning_rate": 4.330408190400213e-05,
"loss": 2.7332,
"step": 145500
},
{
"epoch": 29.118468288791384,
"grad_norm": 6.654098987579346,
"learning_rate": 4.313821300358995e-05,
"loss": 2.7109,
"step": 146000
},
{
"epoch": 29.218189070602314,
"grad_norm": 6.924403667449951,
"learning_rate": 4.297201170057173e-05,
"loss": 2.7076,
"step": 146500
},
{
"epoch": 29.317909852413244,
"grad_norm": 7.731849193572998,
"learning_rate": 4.280581039755352e-05,
"loss": 2.6943,
"step": 147000
},
{
"epoch": 29.41763063422417,
"grad_norm": 7.095526218414307,
"learning_rate": 4.26396090945353e-05,
"loss": 2.72,
"step": 147500
},
{
"epoch": 29.5173514160351,
"grad_norm": 7.1939520835876465,
"learning_rate": 4.247340779151709e-05,
"loss": 2.6772,
"step": 148000
},
{
"epoch": 29.61707219784603,
"grad_norm": 7.466503620147705,
"learning_rate": 4.230753889110491e-05,
"loss": 2.7193,
"step": 148500
},
{
"epoch": 29.71679297965696,
"grad_norm": 6.902263164520264,
"learning_rate": 4.214133758808669e-05,
"loss": 2.716,
"step": 149000
},
{
"epoch": 29.81651376146789,
"grad_norm": 7.366625785827637,
"learning_rate": 4.197513628506848e-05,
"loss": 2.7009,
"step": 149500
},
{
"epoch": 29.91623454327882,
"grad_norm": 6.991941452026367,
"learning_rate": 4.180893498205026e-05,
"loss": 2.7202,
"step": 150000
},
{
"epoch": 30.0,
"step": 150420,
"total_flos": 1.5839169150106368e+17,
"train_loss": 0.47119966579742084,
"train_runtime": 6930.0607,
"train_samples_per_second": 347.265,
"train_steps_per_second": 21.705
},
{
"epoch": 30.01595532508975,
"grad_norm": 6.077478885650635,
"learning_rate": 4.997720667844322e-05,
"loss": 2.7286,
"step": 150500
},
{
"epoch": 30.11567610690068,
"grad_norm": 6.566033363342285,
"learning_rate": 4.983474841871332e-05,
"loss": 2.7319,
"step": 151000
},
{
"epoch": 30.215396888711606,
"grad_norm": 7.486234188079834,
"learning_rate": 4.969229015898342e-05,
"loss": 2.7899,
"step": 151500
},
{
"epoch": 30.315117670522536,
"grad_norm": 7.640929222106934,
"learning_rate": 4.954983189925352e-05,
"loss": 2.7598,
"step": 152000
},
{
"epoch": 30.414838452333466,
"grad_norm": 7.036547660827637,
"learning_rate": 4.940737363952362e-05,
"loss": 2.754,
"step": 152500
},
{
"epoch": 30.514559234144397,
"grad_norm": 7.128058910369873,
"learning_rate": 4.926491537979372e-05,
"loss": 2.7888,
"step": 153000
},
{
"epoch": 30.614280015955327,
"grad_norm": 7.1788249015808105,
"learning_rate": 4.912245712006382e-05,
"loss": 2.7662,
"step": 153500
},
{
"epoch": 30.714000797766253,
"grad_norm": 7.081215858459473,
"learning_rate": 4.897999886033392e-05,
"loss": 2.7722,
"step": 154000
},
{
"epoch": 30.813721579577184,
"grad_norm": 6.131695747375488,
"learning_rate": 4.883754060060402e-05,
"loss": 2.7464,
"step": 154500
},
{
"epoch": 30.913442361388114,
"grad_norm": 6.66817569732666,
"learning_rate": 4.869508234087412e-05,
"loss": 2.7352,
"step": 155000
},
{
"epoch": 31.013163143199044,
"grad_norm": 7.4430952072143555,
"learning_rate": 4.8552908997663685e-05,
"loss": 2.7503,
"step": 155500
},
{
"epoch": 31.11288392500997,
"grad_norm": 7.984841346740723,
"learning_rate": 4.8410450737933786e-05,
"loss": 2.6821,
"step": 156000
},
{
"epoch": 31.2126047068209,
"grad_norm": 7.386984348297119,
"learning_rate": 4.8267992478203886e-05,
"loss": 2.6916,
"step": 156500
},
{
"epoch": 31.31232548863183,
"grad_norm": 6.3857951164245605,
"learning_rate": 4.8125534218473987e-05,
"loss": 2.6826,
"step": 157000
},
{
"epoch": 31.41204627044276,
"grad_norm": 7.394888401031494,
"learning_rate": 4.798307595874409e-05,
"loss": 2.7099,
"step": 157500
},
{
"epoch": 31.51176705225369,
"grad_norm": 7.39955997467041,
"learning_rate": 4.784061769901419e-05,
"loss": 2.7056,
"step": 158000
},
{
"epoch": 31.61148783406462,
"grad_norm": 6.624033451080322,
"learning_rate": 4.769844435580375e-05,
"loss": 2.6903,
"step": 158500
},
{
"epoch": 31.71120861587555,
"grad_norm": 6.656693458557129,
"learning_rate": 4.755627101259331e-05,
"loss": 2.6877,
"step": 159000
},
{
"epoch": 31.81092939768648,
"grad_norm": 7.474542140960693,
"learning_rate": 4.741381275286341e-05,
"loss": 2.6965,
"step": 159500
},
{
"epoch": 31.910650179497406,
"grad_norm": 7.388774394989014,
"learning_rate": 4.727135449313351e-05,
"loss": 2.7145,
"step": 160000
},
{
"epoch": 32.01037096130834,
"grad_norm": 7.423541069030762,
"learning_rate": 4.712889623340361e-05,
"loss": 2.6943,
"step": 160500
},
{
"epoch": 32.11009174311926,
"grad_norm": 6.063508033752441,
"learning_rate": 4.698643797367371e-05,
"loss": 2.6214,
"step": 161000
},
{
"epoch": 32.20981252493019,
"grad_norm": 7.619082450866699,
"learning_rate": 4.6843979713943814e-05,
"loss": 2.6318,
"step": 161500
},
{
"epoch": 32.30953330674112,
"grad_norm": 6.978066921234131,
"learning_rate": 4.670152145421392e-05,
"loss": 2.6327,
"step": 162000
},
{
"epoch": 32.40925408855205,
"grad_norm": 6.166346073150635,
"learning_rate": 4.655906319448402e-05,
"loss": 2.6419,
"step": 162500
},
{
"epoch": 32.508974870362984,
"grad_norm": 7.364738464355469,
"learning_rate": 4.641660493475412e-05,
"loss": 2.6356,
"step": 163000
},
{
"epoch": 32.608695652173914,
"grad_norm": 7.476531982421875,
"learning_rate": 4.627414667502422e-05,
"loss": 2.6344,
"step": 163500
},
{
"epoch": 32.708416433984844,
"grad_norm": 7.627068042755127,
"learning_rate": 4.613168841529432e-05,
"loss": 2.6434,
"step": 164000
},
{
"epoch": 32.808137215795774,
"grad_norm": 7.334908962249756,
"learning_rate": 4.598923015556442e-05,
"loss": 2.663,
"step": 164500
},
{
"epoch": 32.907857997606705,
"grad_norm": 6.580120086669922,
"learning_rate": 4.5847341728873446e-05,
"loss": 2.6406,
"step": 165000
},
{
"epoch": 33.00757877941763,
"grad_norm": 6.953055381774902,
"learning_rate": 4.570488346914355e-05,
"loss": 2.6517,
"step": 165500
},
{
"epoch": 33.10729956122856,
"grad_norm": 6.980926036834717,
"learning_rate": 4.556242520941365e-05,
"loss": 2.589,
"step": 166000
},
{
"epoch": 33.20702034303949,
"grad_norm": 7.215412616729736,
"learning_rate": 4.541996694968375e-05,
"loss": 2.5831,
"step": 166500
},
{
"epoch": 33.30674112485042,
"grad_norm": 7.203444004058838,
"learning_rate": 4.527750868995385e-05,
"loss": 2.5739,
"step": 167000
},
{
"epoch": 33.40646190666135,
"grad_norm": 5.696502685546875,
"learning_rate": 4.513505043022395e-05,
"loss": 2.604,
"step": 167500
},
{
"epoch": 33.50618268847228,
"grad_norm": 6.160342216491699,
"learning_rate": 4.499259217049405e-05,
"loss": 2.5848,
"step": 168000
},
{
"epoch": 33.60590347028321,
"grad_norm": 6.758869171142578,
"learning_rate": 4.485013391076415e-05,
"loss": 2.6157,
"step": 168500
},
{
"epoch": 33.70562425209414,
"grad_norm": 7.064002513885498,
"learning_rate": 4.4708245484073166e-05,
"loss": 2.5765,
"step": 169000
},
{
"epoch": 33.80534503390506,
"grad_norm": 7.993391513824463,
"learning_rate": 4.4565787224343267e-05,
"loss": 2.6115,
"step": 169500
},
{
"epoch": 33.90506581571599,
"grad_norm": 7.196022033691406,
"learning_rate": 4.442332896461337e-05,
"loss": 2.591,
"step": 170000
},
{
"epoch": 34.00478659752692,
"grad_norm": 8.118667602539062,
"learning_rate": 4.428115562140293e-05,
"loss": 2.5833,
"step": 170500
},
{
"epoch": 34.10450737933785,
"grad_norm": 7.465199947357178,
"learning_rate": 4.413869736167303e-05,
"loss": 2.5509,
"step": 171000
},
{
"epoch": 34.204228161148784,
"grad_norm": 6.739304542541504,
"learning_rate": 4.399623910194313e-05,
"loss": 2.5357,
"step": 171500
},
{
"epoch": 34.303948942959714,
"grad_norm": 6.758444786071777,
"learning_rate": 4.385378084221323e-05,
"loss": 2.567,
"step": 172000
},
{
"epoch": 34.403669724770644,
"grad_norm": 6.511049270629883,
"learning_rate": 4.371132258248333e-05,
"loss": 2.5759,
"step": 172500
},
{
"epoch": 34.503390506581574,
"grad_norm": 7.730967044830322,
"learning_rate": 4.356886432275343e-05,
"loss": 2.5494,
"step": 173000
},
{
"epoch": 34.6031112883925,
"grad_norm": 6.543623924255371,
"learning_rate": 4.342640606302353e-05,
"loss": 2.5482,
"step": 173500
},
{
"epoch": 34.70283207020343,
"grad_norm": 7.216828346252441,
"learning_rate": 4.328394780329364e-05,
"loss": 2.5593,
"step": 174000
},
{
"epoch": 34.80255285201436,
"grad_norm": 6.891706943511963,
"learning_rate": 4.3141774460083194e-05,
"loss": 2.5409,
"step": 174500
},
{
"epoch": 34.90227363382529,
"grad_norm": 7.4927778244018555,
"learning_rate": 4.29993162003533e-05,
"loss": 2.5673,
"step": 175000
},
{
"epoch": 35.0,
"step": 175490,
"total_flos": 1.8479030675124096e+17,
"train_loss": 0.37831091759340585,
"train_runtime": 6392.496,
"train_samples_per_second": 439.213,
"train_steps_per_second": 27.453
}
],
"logging_steps": 500,
"max_steps": 175490,
"num_input_tokens_seen": 0,
"num_train_epochs": 35,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8479030675124096e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}