opt-babylm2-subset-default-3e-4 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 141690,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07057661091114405,
"grad_norm": 0.8544681072235107,
"learning_rate": 9.375e-06,
"loss": 5.8075,
"step": 1000
},
{
"epoch": 0.1411532218222881,
"grad_norm": 0.9348427057266235,
"learning_rate": 1.875e-05,
"loss": 3.9507,
"step": 2000
},
{
"epoch": 0.21172983273343213,
"grad_norm": 0.9987612366676331,
"learning_rate": 2.8125e-05,
"loss": 3.6154,
"step": 3000
},
{
"epoch": 0.2823064436445762,
"grad_norm": 0.9734600782394409,
"learning_rate": 3.75e-05,
"loss": 3.444,
"step": 4000
},
{
"epoch": 0.3528830545557202,
"grad_norm": 0.9963154196739197,
"learning_rate": 4.6874999999999994e-05,
"loss": 3.2983,
"step": 5000
},
{
"epoch": 0.42345966546686425,
"grad_norm": 0.8833392858505249,
"learning_rate": 5.625e-05,
"loss": 3.1806,
"step": 6000
},
{
"epoch": 0.49403627637800834,
"grad_norm": 0.8395134806632996,
"learning_rate": 6.5625e-05,
"loss": 3.0762,
"step": 7000
},
{
"epoch": 0.5646128872891524,
"grad_norm": 0.9056613445281982,
"learning_rate": 7.5e-05,
"loss": 3.0103,
"step": 8000
},
{
"epoch": 0.6351894982002965,
"grad_norm": 0.8697331547737122,
"learning_rate": 8.437499999999999e-05,
"loss": 2.9146,
"step": 9000
},
{
"epoch": 0.7057661091114404,
"grad_norm": 0.7743774056434631,
"learning_rate": 9.374999999999999e-05,
"loss": 2.8472,
"step": 10000
},
{
"epoch": 0.7763427200225845,
"grad_norm": 0.769959568977356,
"learning_rate": 0.00010312499999999999,
"loss": 2.8093,
"step": 11000
},
{
"epoch": 0.8469193309337285,
"grad_norm": 0.7926977872848511,
"learning_rate": 0.0001125,
"loss": 2.7498,
"step": 12000
},
{
"epoch": 0.9174959418448726,
"grad_norm": 0.7256486415863037,
"learning_rate": 0.000121865625,
"loss": 2.714,
"step": 13000
},
{
"epoch": 0.9880725527560167,
"grad_norm": 0.7287374138832092,
"learning_rate": 0.000131240625,
"loss": 2.6575,
"step": 14000
},
{
"epoch": 1.0,
"eval_accuracy": 0.4749594473317534,
"eval_loss": 2.8570854663848877,
"eval_runtime": 123.5266,
"eval_samples_per_second": 380.088,
"eval_steps_per_second": 5.942,
"step": 14169
},
{
"epoch": 1.0586491636671607,
"grad_norm": 0.7397704124450684,
"learning_rate": 0.00014060625,
"loss": 2.6238,
"step": 15000
},
{
"epoch": 1.1292257745783048,
"grad_norm": 0.7099004983901978,
"learning_rate": 0.000149971875,
"loss": 2.5914,
"step": 16000
},
{
"epoch": 1.1998023854894488,
"grad_norm": 0.6785891056060791,
"learning_rate": 0.000159346875,
"loss": 2.5765,
"step": 17000
},
{
"epoch": 1.2703789964005927,
"grad_norm": 0.6459276080131531,
"learning_rate": 0.000168703125,
"loss": 2.5484,
"step": 18000
},
{
"epoch": 1.340955607311737,
"grad_norm": 0.6296180486679077,
"learning_rate": 0.000178078125,
"loss": 2.5329,
"step": 19000
},
{
"epoch": 1.4115322182228809,
"grad_norm": 0.648757815361023,
"learning_rate": 0.00018745312499999998,
"loss": 2.5078,
"step": 20000
},
{
"epoch": 1.482108829134025,
"grad_norm": 0.6126940250396729,
"learning_rate": 0.00019681874999999998,
"loss": 2.5066,
"step": 21000
},
{
"epoch": 1.552685440045169,
"grad_norm": 0.5499350428581238,
"learning_rate": 0.00020618437499999995,
"loss": 2.4882,
"step": 22000
},
{
"epoch": 1.623262050956313,
"grad_norm": 0.7012745141983032,
"learning_rate": 0.00021555937499999998,
"loss": 2.4746,
"step": 23000
},
{
"epoch": 1.6938386618674572,
"grad_norm": 0.563831627368927,
"learning_rate": 0.00022493437499999998,
"loss": 2.4607,
"step": 24000
},
{
"epoch": 1.764415272778601,
"grad_norm": 0.4928041696548462,
"learning_rate": 0.00023430937499999997,
"loss": 2.4454,
"step": 25000
},
{
"epoch": 1.8349918836897452,
"grad_norm": 0.5389479398727417,
"learning_rate": 0.00024367499999999997,
"loss": 2.4429,
"step": 26000
},
{
"epoch": 1.9055684946008893,
"grad_norm": 0.549089252948761,
"learning_rate": 0.000253040625,
"loss": 2.4239,
"step": 27000
},
{
"epoch": 1.9761451055120332,
"grad_norm": 0.5027530193328857,
"learning_rate": 0.000262415625,
"loss": 2.4179,
"step": 28000
},
{
"epoch": 2.0,
"eval_accuracy": 0.4989820084802377,
"eval_loss": 2.6257801055908203,
"eval_runtime": 124.6794,
"eval_samples_per_second": 376.574,
"eval_steps_per_second": 5.887,
"step": 28338
},
{
"epoch": 2.0467217164231775,
"grad_norm": 0.4511743485927582,
"learning_rate": 0.000271790625,
"loss": 2.3885,
"step": 29000
},
{
"epoch": 2.1172983273343213,
"grad_norm": 0.4871021807193756,
"learning_rate": 0.00028115624999999994,
"loss": 2.378,
"step": 30000
},
{
"epoch": 2.1878749382454656,
"grad_norm": 0.4743562638759613,
"learning_rate": 0.00029053124999999994,
"loss": 2.3721,
"step": 31000
},
{
"epoch": 2.2584515491566095,
"grad_norm": 0.4668987989425659,
"learning_rate": 0.00029990624999999993,
"loss": 2.3584,
"step": 32000
},
{
"epoch": 2.3290281600677534,
"grad_norm": 0.4822200834751129,
"learning_rate": 0.0002972951043850852,
"loss": 2.3592,
"step": 33000
},
{
"epoch": 2.3996047709788977,
"grad_norm": 0.43820998072624207,
"learning_rate": 0.0002945601239857781,
"loss": 2.3459,
"step": 34000
},
{
"epoch": 2.4701813818900415,
"grad_norm": 0.4586232602596283,
"learning_rate": 0.0002918251435864709,
"loss": 2.3499,
"step": 35000
},
{
"epoch": 2.5407579928011854,
"grad_norm": 0.4139242172241211,
"learning_rate": 0.0002890928981675631,
"loss": 2.3302,
"step": 36000
},
{
"epoch": 2.6113346037123297,
"grad_norm": 0.41075077652931213,
"learning_rate": 0.000286357917768256,
"loss": 2.3236,
"step": 37000
},
{
"epoch": 2.681911214623474,
"grad_norm": 0.4315313994884491,
"learning_rate": 0.00028362567234934815,
"loss": 2.3239,
"step": 38000
},
{
"epoch": 2.752487825534618,
"grad_norm": 0.39671897888183594,
"learning_rate": 0.000280890691950041,
"loss": 2.3087,
"step": 39000
},
{
"epoch": 2.8230644364457618,
"grad_norm": 0.4033380150794983,
"learning_rate": 0.00027815844653113317,
"loss": 2.2952,
"step": 40000
},
{
"epoch": 2.893641047356906,
"grad_norm": 0.36479175090789795,
"learning_rate": 0.000275423466131826,
"loss": 2.3049,
"step": 41000
},
{
"epoch": 2.96421765826805,
"grad_norm": 0.3533291220664978,
"learning_rate": 0.00027268848573251887,
"loss": 2.286,
"step": 42000
},
{
"epoch": 3.0,
"eval_accuracy": 0.5122970740171453,
"eval_loss": 2.5088469982147217,
"eval_runtime": 124.5088,
"eval_samples_per_second": 377.09,
"eval_steps_per_second": 5.895,
"step": 42507
},
{
"epoch": 3.034794269179194,
"grad_norm": 0.37031927704811096,
"learning_rate": 0.00026995350533321177,
"loss": 2.2529,
"step": 43000
},
{
"epoch": 3.105370880090338,
"grad_norm": 0.39132222533226013,
"learning_rate": 0.0002672185249339046,
"loss": 2.2252,
"step": 44000
},
{
"epoch": 3.175947491001482,
"grad_norm": 0.36771416664123535,
"learning_rate": 0.0002644835445345975,
"loss": 2.2389,
"step": 45000
},
{
"epoch": 3.2465241019126263,
"grad_norm": 0.40541785955429077,
"learning_rate": 0.00026175129911568964,
"loss": 2.2224,
"step": 46000
},
{
"epoch": 3.31710071282377,
"grad_norm": 0.4325103461742401,
"learning_rate": 0.0002590190536967818,
"loss": 2.2286,
"step": 47000
},
{
"epoch": 3.3876773237349145,
"grad_norm": 0.3990887403488159,
"learning_rate": 0.00025628407329747466,
"loss": 2.216,
"step": 48000
},
{
"epoch": 3.4582539346460583,
"grad_norm": 0.36805373430252075,
"learning_rate": 0.00025354909289816756,
"loss": 2.2209,
"step": 49000
},
{
"epoch": 3.528830545557202,
"grad_norm": 0.357721209526062,
"learning_rate": 0.0002508141124988604,
"loss": 2.2218,
"step": 50000
},
{
"epoch": 3.5994071564683465,
"grad_norm": 0.3424566984176636,
"learning_rate": 0.00024808460206035186,
"loss": 2.2159,
"step": 51000
},
{
"epoch": 3.6699837673794904,
"grad_norm": 0.35358792543411255,
"learning_rate": 0.00024534962166104476,
"loss": 2.2085,
"step": 52000
},
{
"epoch": 3.7405603782906347,
"grad_norm": 0.369150310754776,
"learning_rate": 0.0002426146412617376,
"loss": 2.2033,
"step": 53000
},
{
"epoch": 3.8111369892017786,
"grad_norm": 0.3341532051563263,
"learning_rate": 0.00023987966086243048,
"loss": 2.2017,
"step": 54000
},
{
"epoch": 3.8817136001129224,
"grad_norm": 0.40089789032936096,
"learning_rate": 0.00023714741544352263,
"loss": 2.2015,
"step": 55000
},
{
"epoch": 3.9522902110240667,
"grad_norm": 0.35854268074035645,
"learning_rate": 0.0002344124350442155,
"loss": 2.2124,
"step": 56000
},
{
"epoch": 4.0,
"eval_accuracy": 0.5203132962446899,
"eval_loss": 2.444835662841797,
"eval_runtime": 124.4717,
"eval_samples_per_second": 377.202,
"eval_steps_per_second": 5.897,
"step": 56676
},
{
"epoch": 4.022866821935211,
"grad_norm": 0.3867768347263336,
"learning_rate": 0.00023167745464490835,
"loss": 2.1839,
"step": 57000
},
{
"epoch": 4.093443432846355,
"grad_norm": 0.3627295196056366,
"learning_rate": 0.00022894520922600055,
"loss": 2.1384,
"step": 58000
},
{
"epoch": 4.164020043757499,
"grad_norm": 0.35046979784965515,
"learning_rate": 0.00022621022882669337,
"loss": 2.1439,
"step": 59000
},
{
"epoch": 4.234596654668643,
"grad_norm": 0.3317088186740875,
"learning_rate": 0.00022347524842738624,
"loss": 2.1446,
"step": 60000
},
{
"epoch": 4.3051732655797865,
"grad_norm": 0.37563446164131165,
"learning_rate": 0.00022074300300847845,
"loss": 2.1408,
"step": 61000
},
{
"epoch": 4.375749876490931,
"grad_norm": 0.36360374093055725,
"learning_rate": 0.00021800802260917127,
"loss": 2.1447,
"step": 62000
},
{
"epoch": 4.446326487402075,
"grad_norm": 0.35474300384521484,
"learning_rate": 0.00021527577719026347,
"loss": 2.1478,
"step": 63000
},
{
"epoch": 4.516903098313219,
"grad_norm": 0.38771218061447144,
"learning_rate": 0.00021254079679095632,
"loss": 2.136,
"step": 64000
},
{
"epoch": 4.587479709224363,
"grad_norm": 0.3860458433628082,
"learning_rate": 0.0002098085513720485,
"loss": 2.1379,
"step": 65000
},
{
"epoch": 4.658056320135507,
"grad_norm": 0.3763484060764313,
"learning_rate": 0.00020707357097274134,
"loss": 2.1344,
"step": 66000
},
{
"epoch": 4.7286329310466515,
"grad_norm": 0.37907466292381287,
"learning_rate": 0.0002043413255538335,
"loss": 2.1426,
"step": 67000
},
{
"epoch": 4.799209541957795,
"grad_norm": 0.34690865874290466,
"learning_rate": 0.00020160634515452636,
"loss": 2.1208,
"step": 68000
},
{
"epoch": 4.869786152868939,
"grad_norm": 0.36183568835258484,
"learning_rate": 0.00019887409973561856,
"loss": 2.1242,
"step": 69000
},
{
"epoch": 4.940362763780083,
"grad_norm": 0.35947293043136597,
"learning_rate": 0.00019613911933631138,
"loss": 2.1307,
"step": 70000
},
{
"epoch": 5.0,
"eval_accuracy": 0.5251367702083976,
"eval_loss": 2.4099230766296387,
"eval_runtime": 124.5048,
"eval_samples_per_second": 377.102,
"eval_steps_per_second": 5.895,
"step": 70845
},
{
"epoch": 5.010939374691227,
"grad_norm": 0.3575204312801361,
"learning_rate": 0.00019340687391740358,
"loss": 2.1265,
"step": 71000
},
{
"epoch": 5.081515985602372,
"grad_norm": 0.38784071803092957,
"learning_rate": 0.00019067189351809646,
"loss": 2.064,
"step": 72000
},
{
"epoch": 5.152092596513516,
"grad_norm": 0.34589263796806335,
"learning_rate": 0.0001879396480991886,
"loss": 2.0754,
"step": 73000
},
{
"epoch": 5.2226692074246595,
"grad_norm": 0.3403594195842743,
"learning_rate": 0.00018520466769988148,
"loss": 2.073,
"step": 74000
},
{
"epoch": 5.293245818335803,
"grad_norm": 0.38706710934638977,
"learning_rate": 0.00018246968730057433,
"loss": 2.0812,
"step": 75000
},
{
"epoch": 5.363822429246947,
"grad_norm": 0.39309191703796387,
"learning_rate": 0.0001797347069012672,
"loss": 2.0915,
"step": 76000
},
{
"epoch": 5.434399040158092,
"grad_norm": 0.37432822585105896,
"learning_rate": 0.00017700246148235935,
"loss": 2.0755,
"step": 77000
},
{
"epoch": 5.504975651069236,
"grad_norm": 0.3538018465042114,
"learning_rate": 0.00017426748108305222,
"loss": 2.0772,
"step": 78000
},
{
"epoch": 5.57555226198038,
"grad_norm": 0.3601301610469818,
"learning_rate": 0.00017153523566414437,
"loss": 2.085,
"step": 79000
},
{
"epoch": 5.6461288728915235,
"grad_norm": 0.3469400703907013,
"learning_rate": 0.00016880299024523657,
"loss": 2.078,
"step": 80000
},
{
"epoch": 5.716705483802667,
"grad_norm": 0.3623177111148834,
"learning_rate": 0.00016607074482632872,
"loss": 2.077,
"step": 81000
},
{
"epoch": 5.787282094713812,
"grad_norm": 0.3382331132888794,
"learning_rate": 0.0001633357644270216,
"loss": 2.0745,
"step": 82000
},
{
"epoch": 5.857858705624956,
"grad_norm": 0.3787217140197754,
"learning_rate": 0.00016060078402771447,
"loss": 2.0768,
"step": 83000
},
{
"epoch": 5.9284353165361,
"grad_norm": 0.36773914098739624,
"learning_rate": 0.00015786580362840732,
"loss": 2.0756,
"step": 84000
},
{
"epoch": 5.999011927447244,
"grad_norm": 0.39036858081817627,
"learning_rate": 0.00015513082322910016,
"loss": 2.0706,
"step": 85000
},
{
"epoch": 6.0,
"eval_accuracy": 0.5280841264512295,
"eval_loss": 2.388700246810913,
"eval_runtime": 125.3022,
"eval_samples_per_second": 374.702,
"eval_steps_per_second": 5.858,
"step": 85014
},
{
"epoch": 6.069588538358388,
"grad_norm": 0.3616304099559784,
"learning_rate": 0.00015239584282979304,
"loss": 2.0123,
"step": 86000
},
{
"epoch": 6.140165149269532,
"grad_norm": 0.37002402544021606,
"learning_rate": 0.0001496635974108852,
"loss": 2.0153,
"step": 87000
},
{
"epoch": 6.210741760180676,
"grad_norm": 0.3603184223175049,
"learning_rate": 0.0001469286170115781,
"loss": 2.0199,
"step": 88000
},
{
"epoch": 6.28131837109182,
"grad_norm": 0.37550196051597595,
"learning_rate": 0.00014419637159267024,
"loss": 2.0138,
"step": 89000
},
{
"epoch": 6.351894982002964,
"grad_norm": 0.3768686056137085,
"learning_rate": 0.0001414641261737624,
"loss": 2.0314,
"step": 90000
},
{
"epoch": 6.422471592914108,
"grad_norm": 0.3591078221797943,
"learning_rate": 0.00013872914577445526,
"loss": 2.0226,
"step": 91000
},
{
"epoch": 6.493048203825253,
"grad_norm": 0.3905663788318634,
"learning_rate": 0.00013599416537514813,
"loss": 2.0257,
"step": 92000
},
{
"epoch": 6.5636248147363965,
"grad_norm": 0.39147230982780457,
"learning_rate": 0.00013325918497584098,
"loss": 2.0311,
"step": 93000
},
{
"epoch": 6.63420142564754,
"grad_norm": 0.40250155329704285,
"learning_rate": 0.00013052420457653385,
"loss": 2.0301,
"step": 94000
},
{
"epoch": 6.704778036558684,
"grad_norm": 0.3860897123813629,
"learning_rate": 0.00012779195915762603,
"loss": 2.0312,
"step": 95000
},
{
"epoch": 6.775354647469829,
"grad_norm": 0.3707718253135681,
"learning_rate": 0.0001250597137387182,
"loss": 2.0282,
"step": 96000
},
{
"epoch": 6.845931258380973,
"grad_norm": 0.3690090775489807,
"learning_rate": 0.00012232473333941105,
"loss": 2.0255,
"step": 97000
},
{
"epoch": 6.916507869292117,
"grad_norm": 0.4174107313156128,
"learning_rate": 0.00011958975294010391,
"loss": 2.0276,
"step": 98000
},
{
"epoch": 6.987084480203261,
"grad_norm": 0.3740842938423157,
"learning_rate": 0.00011685750752119609,
"loss": 2.0233,
"step": 99000
},
{
"epoch": 7.0,
"eval_accuracy": 0.5303788443403243,
"eval_loss": 2.3779311180114746,
"eval_runtime": 124.6993,
"eval_samples_per_second": 376.514,
"eval_steps_per_second": 5.886,
"step": 99183
},
{
"epoch": 7.057661091114404,
"grad_norm": 0.37689074873924255,
"learning_rate": 0.00011412252712188895,
"loss": 1.9805,
"step": 100000
},
{
"epoch": 7.128237702025549,
"grad_norm": 0.4047756493091583,
"learning_rate": 0.00011139028170298112,
"loss": 1.9746,
"step": 101000
},
{
"epoch": 7.198814312936693,
"grad_norm": 0.3889460563659668,
"learning_rate": 0.00010865530130367397,
"loss": 1.9659,
"step": 102000
},
{
"epoch": 7.269390923847837,
"grad_norm": 0.4061487019062042,
"learning_rate": 0.00010592032090436684,
"loss": 1.9708,
"step": 103000
},
{
"epoch": 7.339967534758981,
"grad_norm": 0.4057160019874573,
"learning_rate": 0.0001031853405050597,
"loss": 1.9772,
"step": 104000
},
{
"epoch": 7.410544145670125,
"grad_norm": 0.40489134192466736,
"learning_rate": 0.00010045309508615188,
"loss": 1.9736,
"step": 105000
},
{
"epoch": 7.481120756581269,
"grad_norm": 0.4148654043674469,
"learning_rate": 9.771811468684473e-05,
"loss": 1.9796,
"step": 106000
},
{
"epoch": 7.551697367492413,
"grad_norm": 0.41483381390571594,
"learning_rate": 9.49831342875376e-05,
"loss": 1.9804,
"step": 107000
},
{
"epoch": 7.622273978403557,
"grad_norm": 0.43718773126602173,
"learning_rate": 9.225088886862978e-05,
"loss": 1.9712,
"step": 108000
},
{
"epoch": 7.692850589314701,
"grad_norm": 0.40646329522132874,
"learning_rate": 8.951590846932262e-05,
"loss": 1.9822,
"step": 109000
},
{
"epoch": 7.763427200225845,
"grad_norm": 0.44571158289909363,
"learning_rate": 8.67836630504148e-05,
"loss": 1.9832,
"step": 110000
},
{
"epoch": 7.83400381113699,
"grad_norm": 0.41726765036582947,
"learning_rate": 8.404868265110766e-05,
"loss": 1.9747,
"step": 111000
},
{
"epoch": 7.9045804220481335,
"grad_norm": 0.39210569858551025,
"learning_rate": 8.131370225180053e-05,
"loss": 1.9929,
"step": 112000
},
{
"epoch": 7.975157032959277,
"grad_norm": 0.37121346592903137,
"learning_rate": 7.858145683289268e-05,
"loss": 1.9727,
"step": 113000
},
{
"epoch": 8.0,
"eval_accuracy": 0.5315104156523147,
"eval_loss": 2.3731467723846436,
"eval_runtime": 124.474,
"eval_samples_per_second": 377.195,
"eval_steps_per_second": 5.897,
"step": 113352
},
{
"epoch": 8.045733643870422,
"grad_norm": 0.3864983916282654,
"learning_rate": 7.584647643358555e-05,
"loss": 1.9365,
"step": 114000
},
{
"epoch": 8.116310254781565,
"grad_norm": 0.4133272171020508,
"learning_rate": 7.311423101467773e-05,
"loss": 1.9174,
"step": 115000
},
{
"epoch": 8.18688686569271,
"grad_norm": 0.4471355676651001,
"learning_rate": 7.037925061537059e-05,
"loss": 1.9327,
"step": 116000
},
{
"epoch": 8.257463476603853,
"grad_norm": 0.42897623777389526,
"learning_rate": 6.764700519646275e-05,
"loss": 1.9288,
"step": 117000
},
{
"epoch": 8.328040087514998,
"grad_norm": 0.43864506483078003,
"learning_rate": 6.491202479715561e-05,
"loss": 1.9359,
"step": 118000
},
{
"epoch": 8.398616698426142,
"grad_norm": 0.46767184138298035,
"learning_rate": 6.217977937824779e-05,
"loss": 1.9399,
"step": 119000
},
{
"epoch": 8.469193309337285,
"grad_norm": 0.4159405827522278,
"learning_rate": 5.944479897894064e-05,
"loss": 1.9318,
"step": 120000
},
{
"epoch": 8.53976992024843,
"grad_norm": 0.4233142137527466,
"learning_rate": 5.671255356003281e-05,
"loss": 1.9316,
"step": 121000
},
{
"epoch": 8.610346531159573,
"grad_norm": 0.4398587942123413,
"learning_rate": 5.3980308141124983e-05,
"loss": 1.9349,
"step": 122000
},
{
"epoch": 8.680923142070718,
"grad_norm": 0.424790620803833,
"learning_rate": 5.124532774181785e-05,
"loss": 1.9382,
"step": 123000
},
{
"epoch": 8.751499752981863,
"grad_norm": 0.4141928553581238,
"learning_rate": 4.851034734251071e-05,
"loss": 1.9314,
"step": 124000
},
{
"epoch": 8.822076363893006,
"grad_norm": 0.45448678731918335,
"learning_rate": 4.577536694320357e-05,
"loss": 1.9374,
"step": 125000
},
{
"epoch": 8.89265297480415,
"grad_norm": 0.4196777939796448,
"learning_rate": 4.304312152429574e-05,
"loss": 1.9413,
"step": 126000
},
{
"epoch": 8.963229585715293,
"grad_norm": 0.3975803256034851,
"learning_rate": 4.03081411249886e-05,
"loss": 1.9311,
"step": 127000
},
{
"epoch": 9.0,
"eval_accuracy": 0.532392202583586,
"eval_loss": 2.3728187084198,
"eval_runtime": 124.6433,
"eval_samples_per_second": 376.683,
"eval_steps_per_second": 5.889,
"step": 127521
},
{
"epoch": 9.033806196626438,
"grad_norm": 0.42701438069343567,
"learning_rate": 3.757589570608077e-05,
"loss": 1.9097,
"step": 128000
},
{
"epoch": 9.104382807537583,
"grad_norm": 0.44095513224601746,
"learning_rate": 3.484091530677363e-05,
"loss": 1.8841,
"step": 129000
},
{
"epoch": 9.174959418448726,
"grad_norm": 0.4554646909236908,
"learning_rate": 3.2108669887865805e-05,
"loss": 1.898,
"step": 130000
},
{
"epoch": 9.24553602935987,
"grad_norm": 0.4306824803352356,
"learning_rate": 2.937642446895797e-05,
"loss": 1.8921,
"step": 131000
},
{
"epoch": 9.316112640271013,
"grad_norm": 0.4496276080608368,
"learning_rate": 2.664144406965083e-05,
"loss": 1.8883,
"step": 132000
},
{
"epoch": 9.386689251182158,
"grad_norm": 0.4314197301864624,
"learning_rate": 2.3906463670343695e-05,
"loss": 1.8972,
"step": 133000
},
{
"epoch": 9.457265862093303,
"grad_norm": 0.4485911428928375,
"learning_rate": 2.1171483271036556e-05,
"loss": 1.8908,
"step": 134000
},
{
"epoch": 9.527842473004446,
"grad_norm": 0.4166420102119446,
"learning_rate": 1.8439237852128724e-05,
"loss": 1.8924,
"step": 135000
},
{
"epoch": 9.59841908391559,
"grad_norm": 0.4447433650493622,
"learning_rate": 1.5704257452821588e-05,
"loss": 1.892,
"step": 136000
},
{
"epoch": 9.668995694826734,
"grad_norm": 0.42986035346984863,
"learning_rate": 1.2972012033913756e-05,
"loss": 1.8992,
"step": 137000
},
{
"epoch": 9.739572305737878,
"grad_norm": 0.45574498176574707,
"learning_rate": 1.0237031634606618e-05,
"loss": 1.8972,
"step": 138000
},
{
"epoch": 9.810148916649023,
"grad_norm": 0.4515238404273987,
"learning_rate": 7.5020512352994795e-06,
"loss": 1.8952,
"step": 139000
},
{
"epoch": 9.880725527560166,
"grad_norm": 0.43244990706443787,
"learning_rate": 4.769805816391649e-06,
"loss": 1.8946,
"step": 140000
},
{
"epoch": 9.951302138471311,
"grad_norm": 0.44382914900779724,
"learning_rate": 2.0348254170845108e-06,
"loss": 1.8943,
"step": 141000
},
{
"epoch": 10.0,
"eval_accuracy": 0.5327396962492645,
"eval_loss": 2.377573251724243,
"eval_runtime": 124.448,
"eval_samples_per_second": 377.274,
"eval_steps_per_second": 5.898,
"step": 141690
},
{
"epoch": 10.0,
"step": 141690,
"total_flos": 5.9695438005504e+17,
"train_loss": 2.237272139724321,
"train_runtime": 31036.7572,
"train_samples_per_second": 146.079,
"train_steps_per_second": 4.565
}
],
"logging_steps": 1000,
"max_steps": 141690,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.9695438005504e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}