{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 141690,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07057661091114405,
      "grad_norm": 0.8544681072235107,
      "learning_rate": 9.375e-06,
      "loss": 5.8075,
      "step": 1000
    },
    {
      "epoch": 0.1411532218222881,
      "grad_norm": 0.9348427057266235,
      "learning_rate": 1.875e-05,
      "loss": 3.9507,
      "step": 2000
    },
    {
      "epoch": 0.21172983273343213,
      "grad_norm": 0.9987612366676331,
      "learning_rate": 2.8125e-05,
      "loss": 3.6154,
      "step": 3000
    },
    {
      "epoch": 0.2823064436445762,
      "grad_norm": 0.9734600782394409,
      "learning_rate": 3.75e-05,
      "loss": 3.444,
      "step": 4000
    },
    {
      "epoch": 0.3528830545557202,
      "grad_norm": 0.9963154196739197,
      "learning_rate": 4.6874999999999994e-05,
      "loss": 3.2983,
      "step": 5000
    },
    {
      "epoch": 0.42345966546686425,
      "grad_norm": 0.8833392858505249,
      "learning_rate": 5.625e-05,
      "loss": 3.1806,
      "step": 6000
    },
    {
      "epoch": 0.49403627637800834,
      "grad_norm": 0.8395134806632996,
      "learning_rate": 6.5625e-05,
      "loss": 3.0762,
      "step": 7000
    },
    {
      "epoch": 0.5646128872891524,
      "grad_norm": 0.9056613445281982,
      "learning_rate": 7.5e-05,
      "loss": 3.0103,
      "step": 8000
    },
    {
      "epoch": 0.6351894982002965,
      "grad_norm": 0.8697331547737122,
      "learning_rate": 8.437499999999999e-05,
      "loss": 2.9146,
      "step": 9000
    },
    {
      "epoch": 0.7057661091114404,
      "grad_norm": 0.7743774056434631,
      "learning_rate": 9.374999999999999e-05,
      "loss": 2.8472,
      "step": 10000
    },
    {
      "epoch": 0.7763427200225845,
      "grad_norm": 0.769959568977356,
      "learning_rate": 0.00010312499999999999,
      "loss": 2.8093,
      "step": 11000
    },
    {
      "epoch": 0.8469193309337285,
      "grad_norm": 0.7926977872848511,
      "learning_rate": 0.0001125,
      "loss": 2.7498,
      "step": 12000
    },
    {
      "epoch": 0.9174959418448726,
      "grad_norm": 0.7256486415863037,
      "learning_rate": 0.000121865625,
      "loss": 2.714,
      "step": 13000
    },
    {
      "epoch": 0.9880725527560167,
      "grad_norm": 0.7287374138832092,
      "learning_rate": 0.000131240625,
      "loss": 2.6575,
      "step": 14000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.4749594473317534,
      "eval_loss": 2.8570854663848877,
      "eval_runtime": 123.5266,
      "eval_samples_per_second": 380.088,
      "eval_steps_per_second": 5.942,
      "step": 14169
    },
    {
      "epoch": 1.0586491636671607,
      "grad_norm": 0.7397704124450684,
      "learning_rate": 0.00014060625,
      "loss": 2.6238,
      "step": 15000
    },
    {
      "epoch": 1.1292257745783048,
      "grad_norm": 0.7099004983901978,
      "learning_rate": 0.000149971875,
      "loss": 2.5914,
      "step": 16000
    },
    {
      "epoch": 1.1998023854894488,
      "grad_norm": 0.6785891056060791,
      "learning_rate": 0.000159346875,
      "loss": 2.5765,
      "step": 17000
    },
    {
      "epoch": 1.2703789964005927,
      "grad_norm": 0.6459276080131531,
      "learning_rate": 0.000168703125,
      "loss": 2.5484,
      "step": 18000
    },
    {
      "epoch": 1.340955607311737,
      "grad_norm": 0.6296180486679077,
      "learning_rate": 0.000178078125,
      "loss": 2.5329,
      "step": 19000
    },
    {
      "epoch": 1.4115322182228809,
      "grad_norm": 0.648757815361023,
      "learning_rate": 0.00018745312499999998,
      "loss": 2.5078,
      "step": 20000
    },
    {
      "epoch": 1.482108829134025,
      "grad_norm": 0.6126940250396729,
      "learning_rate": 0.00019681874999999998,
      "loss": 2.5066,
      "step": 21000
    },
    {
      "epoch": 1.552685440045169,
      "grad_norm": 0.5499350428581238,
      "learning_rate": 0.00020618437499999995,
      "loss": 2.4882,
      "step": 22000
    },
    {
      "epoch": 1.623262050956313,
      "grad_norm": 0.7012745141983032,
      "learning_rate": 0.00021555937499999998,
      "loss": 2.4746,
      "step": 23000
    },
    {
      "epoch": 1.6938386618674572,
      "grad_norm": 0.563831627368927,
      "learning_rate": 0.00022493437499999998,
      "loss": 2.4607,
      "step": 24000
    },
    {
      "epoch": 1.764415272778601,
      "grad_norm": 0.4928041696548462,
      "learning_rate": 0.00023430937499999997,
      "loss": 2.4454,
      "step": 25000
    },
    {
      "epoch": 1.8349918836897452,
      "grad_norm": 0.5389479398727417,
      "learning_rate": 0.00024367499999999997,
      "loss": 2.4429,
      "step": 26000
    },
    {
      "epoch": 1.9055684946008893,
      "grad_norm": 0.549089252948761,
      "learning_rate": 0.000253040625,
      "loss": 2.4239,
      "step": 27000
    },
    {
      "epoch": 1.9761451055120332,
      "grad_norm": 0.5027530193328857,
      "learning_rate": 0.000262415625,
      "loss": 2.4179,
      "step": 28000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.4989820084802377,
      "eval_loss": 2.6257801055908203,
      "eval_runtime": 124.6794,
      "eval_samples_per_second": 376.574,
      "eval_steps_per_second": 5.887,
      "step": 28338
    },
    {
      "epoch": 2.0467217164231775,
      "grad_norm": 0.4511743485927582,
      "learning_rate": 0.000271790625,
      "loss": 2.3885,
      "step": 29000
    },
    {
      "epoch": 2.1172983273343213,
      "grad_norm": 0.4871021807193756,
      "learning_rate": 0.00028115624999999994,
      "loss": 2.378,
      "step": 30000
    },
    {
      "epoch": 2.1878749382454656,
      "grad_norm": 0.4743562638759613,
      "learning_rate": 0.00029053124999999994,
      "loss": 2.3721,
      "step": 31000
    },
    {
      "epoch": 2.2584515491566095,
      "grad_norm": 0.4668987989425659,
      "learning_rate": 0.00029990624999999993,
      "loss": 2.3584,
      "step": 32000
    },
    {
      "epoch": 2.3290281600677534,
      "grad_norm": 0.4822200834751129,
      "learning_rate": 0.0002972951043850852,
      "loss": 2.3592,
      "step": 33000
    },
    {
      "epoch": 2.3996047709788977,
      "grad_norm": 0.43820998072624207,
      "learning_rate": 0.0002945601239857781,
      "loss": 2.3459,
      "step": 34000
    },
    {
      "epoch": 2.4701813818900415,
      "grad_norm": 0.4586232602596283,
      "learning_rate": 0.0002918251435864709,
      "loss": 2.3499,
      "step": 35000
    },
    {
      "epoch": 2.5407579928011854,
      "grad_norm": 0.4139242172241211,
      "learning_rate": 0.0002890928981675631,
      "loss": 2.3302,
      "step": 36000
    },
    {
      "epoch": 2.6113346037123297,
      "grad_norm": 0.41075077652931213,
      "learning_rate": 0.000286357917768256,
      "loss": 2.3236,
      "step": 37000
    },
    {
      "epoch": 2.681911214623474,
      "grad_norm": 0.4315313994884491,
      "learning_rate": 0.00028362567234934815,
      "loss": 2.3239,
      "step": 38000
    },
    {
      "epoch": 2.752487825534618,
      "grad_norm": 0.39671897888183594,
      "learning_rate": 0.000280890691950041,
      "loss": 2.3087,
      "step": 39000
    },
    {
      "epoch": 2.8230644364457618,
      "grad_norm": 0.4033380150794983,
      "learning_rate": 0.00027815844653113317,
      "loss": 2.2952,
      "step": 40000
    },
    {
      "epoch": 2.893641047356906,
      "grad_norm": 0.36479175090789795,
      "learning_rate": 0.000275423466131826,
      "loss": 2.3049,
      "step": 41000
    },
    {
      "epoch": 2.96421765826805,
      "grad_norm": 0.3533291220664978,
      "learning_rate": 0.00027268848573251887,
      "loss": 2.286,
      "step": 42000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5122970740171453,
      "eval_loss": 2.5088469982147217,
      "eval_runtime": 124.5088,
      "eval_samples_per_second": 377.09,
      "eval_steps_per_second": 5.895,
      "step": 42507
    },
    {
      "epoch": 3.034794269179194,
      "grad_norm": 0.37031927704811096,
      "learning_rate": 0.00026995350533321177,
      "loss": 2.2529,
      "step": 43000
    },
    {
      "epoch": 3.105370880090338,
      "grad_norm": 0.39132222533226013,
      "learning_rate": 0.0002672185249339046,
      "loss": 2.2252,
      "step": 44000
    },
    {
      "epoch": 3.175947491001482,
      "grad_norm": 0.36771416664123535,
      "learning_rate": 0.0002644835445345975,
      "loss": 2.2389,
      "step": 45000
    },
    {
      "epoch": 3.2465241019126263,
      "grad_norm": 0.40541785955429077,
      "learning_rate": 0.00026175129911568964,
      "loss": 2.2224,
      "step": 46000
    },
    {
      "epoch": 3.31710071282377,
      "grad_norm": 0.4325103461742401,
      "learning_rate": 0.0002590190536967818,
      "loss": 2.2286,
      "step": 47000
    },
    {
      "epoch": 3.3876773237349145,
      "grad_norm": 0.3990887403488159,
      "learning_rate": 0.00025628407329747466,
      "loss": 2.216,
      "step": 48000
    },
    {
      "epoch": 3.4582539346460583,
      "grad_norm": 0.36805373430252075,
      "learning_rate": 0.00025354909289816756,
      "loss": 2.2209,
      "step": 49000
    },
    {
      "epoch": 3.528830545557202,
      "grad_norm": 0.357721209526062,
      "learning_rate": 0.0002508141124988604,
      "loss": 2.2218,
      "step": 50000
    },
    {
      "epoch": 3.5994071564683465,
      "grad_norm": 0.3424566984176636,
      "learning_rate": 0.00024808460206035186,
      "loss": 2.2159,
      "step": 51000
    },
    {
      "epoch": 3.6699837673794904,
      "grad_norm": 0.35358792543411255,
      "learning_rate": 0.00024534962166104476,
      "loss": 2.2085,
      "step": 52000
    },
    {
      "epoch": 3.7405603782906347,
      "grad_norm": 0.369150310754776,
      "learning_rate": 0.0002426146412617376,
      "loss": 2.2033,
      "step": 53000
    },
    {
      "epoch": 3.8111369892017786,
      "grad_norm": 0.3341532051563263,
      "learning_rate": 0.00023987966086243048,
      "loss": 2.2017,
      "step": 54000
    },
    {
      "epoch": 3.8817136001129224,
      "grad_norm": 0.40089789032936096,
      "learning_rate": 0.00023714741544352263,
      "loss": 2.2015,
      "step": 55000
    },
    {
      "epoch": 3.9522902110240667,
      "grad_norm": 0.35854268074035645,
      "learning_rate": 0.0002344124350442155,
      "loss": 2.2124,
      "step": 56000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5203132962446899,
      "eval_loss": 2.444835662841797,
      "eval_runtime": 124.4717,
      "eval_samples_per_second": 377.202,
      "eval_steps_per_second": 5.897,
      "step": 56676
    },
    {
      "epoch": 4.022866821935211,
      "grad_norm": 0.3867768347263336,
      "learning_rate": 0.00023167745464490835,
      "loss": 2.1839,
      "step": 57000
    },
    {
      "epoch": 4.093443432846355,
      "grad_norm": 0.3627295196056366,
      "learning_rate": 0.00022894520922600055,
      "loss": 2.1384,
      "step": 58000
    },
    {
      "epoch": 4.164020043757499,
      "grad_norm": 0.35046979784965515,
      "learning_rate": 0.00022621022882669337,
      "loss": 2.1439,
      "step": 59000
    },
    {
      "epoch": 4.234596654668643,
      "grad_norm": 0.3317088186740875,
      "learning_rate": 0.00022347524842738624,
      "loss": 2.1446,
      "step": 60000
    },
    {
      "epoch": 4.3051732655797865,
      "grad_norm": 0.37563446164131165,
      "learning_rate": 0.00022074300300847845,
      "loss": 2.1408,
      "step": 61000
    },
    {
      "epoch": 4.375749876490931,
      "grad_norm": 0.36360374093055725,
      "learning_rate": 0.00021800802260917127,
      "loss": 2.1447,
      "step": 62000
    },
    {
      "epoch": 4.446326487402075,
      "grad_norm": 0.35474300384521484,
      "learning_rate": 0.00021527577719026347,
      "loss": 2.1478,
      "step": 63000
    },
    {
      "epoch": 4.516903098313219,
      "grad_norm": 0.38771218061447144,
      "learning_rate": 0.00021254079679095632,
      "loss": 2.136,
      "step": 64000
    },
    {
      "epoch": 4.587479709224363,
      "grad_norm": 0.3860458433628082,
      "learning_rate": 0.0002098085513720485,
      "loss": 2.1379,
      "step": 65000
    },
    {
      "epoch": 4.658056320135507,
      "grad_norm": 0.3763484060764313,
      "learning_rate": 0.00020707357097274134,
      "loss": 2.1344,
      "step": 66000
    },
    {
      "epoch": 4.7286329310466515,
      "grad_norm": 0.37907466292381287,
      "learning_rate": 0.0002043413255538335,
      "loss": 2.1426,
      "step": 67000
    },
    {
      "epoch": 4.799209541957795,
      "grad_norm": 0.34690865874290466,
      "learning_rate": 0.00020160634515452636,
      "loss": 2.1208,
      "step": 68000
    },
    {
      "epoch": 4.869786152868939,
      "grad_norm": 0.36183568835258484,
      "learning_rate": 0.00019887409973561856,
      "loss": 2.1242,
      "step": 69000
    },
    {
      "epoch": 4.940362763780083,
      "grad_norm": 0.35947293043136597,
      "learning_rate": 0.00019613911933631138,
      "loss": 2.1307,
      "step": 70000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.5251367702083976,
      "eval_loss": 2.4099230766296387,
      "eval_runtime": 124.5048,
      "eval_samples_per_second": 377.102,
      "eval_steps_per_second": 5.895,
      "step": 70845
    },
    {
      "epoch": 5.010939374691227,
      "grad_norm": 0.3575204312801361,
      "learning_rate": 0.00019340687391740358,
      "loss": 2.1265,
      "step": 71000
    },
    {
      "epoch": 5.081515985602372,
      "grad_norm": 0.38784071803092957,
      "learning_rate": 0.00019067189351809646,
      "loss": 2.064,
      "step": 72000
    },
    {
      "epoch": 5.152092596513516,
      "grad_norm": 0.34589263796806335,
      "learning_rate": 0.0001879396480991886,
      "loss": 2.0754,
      "step": 73000
    },
    {
      "epoch": 5.2226692074246595,
      "grad_norm": 0.3403594195842743,
      "learning_rate": 0.00018520466769988148,
      "loss": 2.073,
      "step": 74000
    },
    {
      "epoch": 5.293245818335803,
      "grad_norm": 0.38706710934638977,
      "learning_rate": 0.00018246968730057433,
      "loss": 2.0812,
      "step": 75000
    },
    {
      "epoch": 5.363822429246947,
      "grad_norm": 0.39309191703796387,
      "learning_rate": 0.0001797347069012672,
      "loss": 2.0915,
      "step": 76000
    },
    {
      "epoch": 5.434399040158092,
      "grad_norm": 0.37432822585105896,
      "learning_rate": 0.00017700246148235935,
      "loss": 2.0755,
      "step": 77000
    },
    {
      "epoch": 5.504975651069236,
      "grad_norm": 0.3538018465042114,
      "learning_rate": 0.00017426748108305222,
      "loss": 2.0772,
      "step": 78000
    },
    {
      "epoch": 5.57555226198038,
      "grad_norm": 0.3601301610469818,
      "learning_rate": 0.00017153523566414437,
      "loss": 2.085,
      "step": 79000
    },
    {
      "epoch": 5.6461288728915235,
      "grad_norm": 0.3469400703907013,
      "learning_rate": 0.00016880299024523657,
      "loss": 2.078,
      "step": 80000
    },
    {
      "epoch": 5.716705483802667,
      "grad_norm": 0.3623177111148834,
      "learning_rate": 0.00016607074482632872,
      "loss": 2.077,
      "step": 81000
    },
    {
      "epoch": 5.787282094713812,
      "grad_norm": 0.3382331132888794,
      "learning_rate": 0.0001633357644270216,
      "loss": 2.0745,
      "step": 82000
    },
    {
      "epoch": 5.857858705624956,
      "grad_norm": 0.3787217140197754,
      "learning_rate": 0.00016060078402771447,
      "loss": 2.0768,
      "step": 83000
    },
    {
      "epoch": 5.9284353165361,
      "grad_norm": 0.36773914098739624,
      "learning_rate": 0.00015786580362840732,
      "loss": 2.0756,
      "step": 84000
    },
    {
      "epoch": 5.999011927447244,
      "grad_norm": 0.39036858081817627,
      "learning_rate": 0.00015513082322910016,
      "loss": 2.0706,
      "step": 85000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.5280841264512295,
      "eval_loss": 2.388700246810913,
      "eval_runtime": 125.3022,
      "eval_samples_per_second": 374.702,
      "eval_steps_per_second": 5.858,
      "step": 85014
    },
    {
      "epoch": 6.069588538358388,
      "grad_norm": 0.3616304099559784,
      "learning_rate": 0.00015239584282979304,
      "loss": 2.0123,
      "step": 86000
    },
    {
      "epoch": 6.140165149269532,
      "grad_norm": 0.37002402544021606,
      "learning_rate": 0.0001496635974108852,
      "loss": 2.0153,
      "step": 87000
    },
    {
      "epoch": 6.210741760180676,
      "grad_norm": 0.3603184223175049,
      "learning_rate": 0.0001469286170115781,
      "loss": 2.0199,
      "step": 88000
    },
    {
      "epoch": 6.28131837109182,
      "grad_norm": 0.37550196051597595,
      "learning_rate": 0.00014419637159267024,
      "loss": 2.0138,
      "step": 89000
    },
    {
      "epoch": 6.351894982002964,
      "grad_norm": 0.3768686056137085,
      "learning_rate": 0.0001414641261737624,
      "loss": 2.0314,
      "step": 90000
    },
    {
      "epoch": 6.422471592914108,
      "grad_norm": 0.3591078221797943,
      "learning_rate": 0.00013872914577445526,
      "loss": 2.0226,
      "step": 91000
    },
    {
      "epoch": 6.493048203825253,
      "grad_norm": 0.3905663788318634,
      "learning_rate": 0.00013599416537514813,
      "loss": 2.0257,
      "step": 92000
    },
    {
      "epoch": 6.5636248147363965,
      "grad_norm": 0.39147230982780457,
      "learning_rate": 0.00013325918497584098,
      "loss": 2.0311,
      "step": 93000
    },
    {
      "epoch": 6.63420142564754,
      "grad_norm": 0.40250155329704285,
      "learning_rate": 0.00013052420457653385,
      "loss": 2.0301,
      "step": 94000
    },
    {
      "epoch": 6.704778036558684,
      "grad_norm": 0.3860897123813629,
      "learning_rate": 0.00012779195915762603,
      "loss": 2.0312,
      "step": 95000
    },
    {
      "epoch": 6.775354647469829,
      "grad_norm": 0.3707718253135681,
      "learning_rate": 0.0001250597137387182,
      "loss": 2.0282,
      "step": 96000
    },
    {
      "epoch": 6.845931258380973,
      "grad_norm": 0.3690090775489807,
      "learning_rate": 0.00012232473333941105,
      "loss": 2.0255,
      "step": 97000
    },
    {
      "epoch": 6.916507869292117,
      "grad_norm": 0.4174107313156128,
      "learning_rate": 0.00011958975294010391,
      "loss": 2.0276,
      "step": 98000
    },
    {
      "epoch": 6.987084480203261,
      "grad_norm": 0.3740842938423157,
      "learning_rate": 0.00011685750752119609,
      "loss": 2.0233,
      "step": 99000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.5303788443403243,
      "eval_loss": 2.3779311180114746,
      "eval_runtime": 124.6993,
      "eval_samples_per_second": 376.514,
      "eval_steps_per_second": 5.886,
      "step": 99183
    },
    {
      "epoch": 7.057661091114404,
      "grad_norm": 0.37689074873924255,
      "learning_rate": 0.00011412252712188895,
      "loss": 1.9805,
      "step": 100000
    },
    {
      "epoch": 7.128237702025549,
      "grad_norm": 0.4047756493091583,
      "learning_rate": 0.00011139028170298112,
      "loss": 1.9746,
      "step": 101000
    },
    {
      "epoch": 7.198814312936693,
      "grad_norm": 0.3889460563659668,
      "learning_rate": 0.00010865530130367397,
      "loss": 1.9659,
      "step": 102000
    },
    {
      "epoch": 7.269390923847837,
      "grad_norm": 0.4061487019062042,
      "learning_rate": 0.00010592032090436684,
      "loss": 1.9708,
      "step": 103000
    },
    {
      "epoch": 7.339967534758981,
      "grad_norm": 0.4057160019874573,
      "learning_rate": 0.0001031853405050597,
      "loss": 1.9772,
      "step": 104000
    },
    {
      "epoch": 7.410544145670125,
      "grad_norm": 0.40489134192466736,
      "learning_rate": 0.00010045309508615188,
      "loss": 1.9736,
      "step": 105000
    },
    {
      "epoch": 7.481120756581269,
      "grad_norm": 0.4148654043674469,
      "learning_rate": 9.771811468684473e-05,
      "loss": 1.9796,
      "step": 106000
    },
    {
      "epoch": 7.551697367492413,
      "grad_norm": 0.41483381390571594,
      "learning_rate": 9.49831342875376e-05,
      "loss": 1.9804,
      "step": 107000
    },
    {
      "epoch": 7.622273978403557,
      "grad_norm": 0.43718773126602173,
      "learning_rate": 9.225088886862978e-05,
      "loss": 1.9712,
      "step": 108000
    },
    {
      "epoch": 7.692850589314701,
      "grad_norm": 0.40646329522132874,
      "learning_rate": 8.951590846932262e-05,
      "loss": 1.9822,
      "step": 109000
    },
    {
      "epoch": 7.763427200225845,
      "grad_norm": 0.44571158289909363,
      "learning_rate": 8.67836630504148e-05,
      "loss": 1.9832,
      "step": 110000
    },
    {
      "epoch": 7.83400381113699,
      "grad_norm": 0.41726765036582947,
      "learning_rate": 8.404868265110766e-05,
      "loss": 1.9747,
      "step": 111000
    },
    {
      "epoch": 7.9045804220481335,
      "grad_norm": 0.39210569858551025,
      "learning_rate": 8.131370225180053e-05,
      "loss": 1.9929,
      "step": 112000
    },
    {
      "epoch": 7.975157032959277,
      "grad_norm": 0.37121346592903137,
      "learning_rate": 7.858145683289268e-05,
      "loss": 1.9727,
      "step": 113000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.5315104156523147,
      "eval_loss": 2.3731467723846436,
      "eval_runtime": 124.474,
      "eval_samples_per_second": 377.195,
      "eval_steps_per_second": 5.897,
      "step": 113352
    },
    {
      "epoch": 8.045733643870422,
      "grad_norm": 0.3864983916282654,
      "learning_rate": 7.584647643358555e-05,
      "loss": 1.9365,
      "step": 114000
    },
    {
      "epoch": 8.116310254781565,
      "grad_norm": 0.4133272171020508,
      "learning_rate": 7.311423101467773e-05,
      "loss": 1.9174,
      "step": 115000
    },
    {
      "epoch": 8.18688686569271,
      "grad_norm": 0.4471355676651001,
      "learning_rate": 7.037925061537059e-05,
      "loss": 1.9327,
      "step": 116000
    },
    {
      "epoch": 8.257463476603853,
      "grad_norm": 0.42897623777389526,
      "learning_rate": 6.764700519646275e-05,
      "loss": 1.9288,
      "step": 117000
    },
    {
      "epoch": 8.328040087514998,
      "grad_norm": 0.43864506483078003,
      "learning_rate": 6.491202479715561e-05,
      "loss": 1.9359,
      "step": 118000
    },
    {
      "epoch": 8.398616698426142,
      "grad_norm": 0.46767184138298035,
      "learning_rate": 6.217977937824779e-05,
      "loss": 1.9399,
      "step": 119000
    },
    {
      "epoch": 8.469193309337285,
      "grad_norm": 0.4159405827522278,
      "learning_rate": 5.944479897894064e-05,
      "loss": 1.9318,
      "step": 120000
    },
    {
      "epoch": 8.53976992024843,
      "grad_norm": 0.4233142137527466,
      "learning_rate": 5.671255356003281e-05,
      "loss": 1.9316,
      "step": 121000
    },
    {
      "epoch": 8.610346531159573,
      "grad_norm": 0.4398587942123413,
      "learning_rate": 5.3980308141124983e-05,
      "loss": 1.9349,
      "step": 122000
    },
    {
      "epoch": 8.680923142070718,
      "grad_norm": 0.424790620803833,
      "learning_rate": 5.124532774181785e-05,
      "loss": 1.9382,
      "step": 123000
    },
    {
      "epoch": 8.751499752981863,
      "grad_norm": 0.4141928553581238,
      "learning_rate": 4.851034734251071e-05,
      "loss": 1.9314,
      "step": 124000
    },
    {
      "epoch": 8.822076363893006,
      "grad_norm": 0.45448678731918335,
      "learning_rate": 4.577536694320357e-05,
      "loss": 1.9374,
      "step": 125000
    },
    {
      "epoch": 8.89265297480415,
      "grad_norm": 0.4196777939796448,
      "learning_rate": 4.304312152429574e-05,
      "loss": 1.9413,
      "step": 126000
    },
    {
      "epoch": 8.963229585715293,
      "grad_norm": 0.3975803256034851,
      "learning_rate": 4.03081411249886e-05,
      "loss": 1.9311,
      "step": 127000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.532392202583586,
      "eval_loss": 2.3728187084198,
      "eval_runtime": 124.6433,
      "eval_samples_per_second": 376.683,
      "eval_steps_per_second": 5.889,
      "step": 127521
    },
    {
      "epoch": 9.033806196626438,
      "grad_norm": 0.42701438069343567,
      "learning_rate": 3.757589570608077e-05,
      "loss": 1.9097,
      "step": 128000
    },
    {
      "epoch": 9.104382807537583,
      "grad_norm": 0.44095513224601746,
      "learning_rate": 3.484091530677363e-05,
      "loss": 1.8841,
      "step": 129000
    },
    {
      "epoch": 9.174959418448726,
      "grad_norm": 0.4554646909236908,
      "learning_rate": 3.2108669887865805e-05,
      "loss": 1.898,
      "step": 130000
    },
    {
      "epoch": 9.24553602935987,
      "grad_norm": 0.4306824803352356,
      "learning_rate": 2.937642446895797e-05,
      "loss": 1.8921,
      "step": 131000
    },
    {
      "epoch": 9.316112640271013,
      "grad_norm": 0.4496276080608368,
      "learning_rate": 2.664144406965083e-05,
      "loss": 1.8883,
      "step": 132000
    },
    {
      "epoch": 9.386689251182158,
      "grad_norm": 0.4314197301864624,
      "learning_rate": 2.3906463670343695e-05,
      "loss": 1.8972,
      "step": 133000
    },
    {
      "epoch": 9.457265862093303,
      "grad_norm": 0.4485911428928375,
      "learning_rate": 2.1171483271036556e-05,
      "loss": 1.8908,
      "step": 134000
    },
    {
      "epoch": 9.527842473004446,
      "grad_norm": 0.4166420102119446,
      "learning_rate": 1.8439237852128724e-05,
      "loss": 1.8924,
      "step": 135000
    },
    {
      "epoch": 9.59841908391559,
      "grad_norm": 0.4447433650493622,
      "learning_rate": 1.5704257452821588e-05,
      "loss": 1.892,
      "step": 136000
    },
    {
      "epoch": 9.668995694826734,
      "grad_norm": 0.42986035346984863,
      "learning_rate": 1.2972012033913756e-05,
      "loss": 1.8992,
      "step": 137000
    },
    {
      "epoch": 9.739572305737878,
      "grad_norm": 0.45574498176574707,
      "learning_rate": 1.0237031634606618e-05,
      "loss": 1.8972,
      "step": 138000
    },
    {
      "epoch": 9.810148916649023,
      "grad_norm": 0.4515238404273987,
      "learning_rate": 7.5020512352994795e-06,
      "loss": 1.8952,
      "step": 139000
    },
    {
      "epoch": 9.880725527560166,
      "grad_norm": 0.43244990706443787,
      "learning_rate": 4.769805816391649e-06,
      "loss": 1.8946,
      "step": 140000
    },
    {
      "epoch": 9.951302138471311,
      "grad_norm": 0.44382914900779724,
      "learning_rate": 2.0348254170845108e-06,
      "loss": 1.8943,
      "step": 141000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.5327396962492645,
      "eval_loss": 2.377573251724243,
      "eval_runtime": 124.448,
      "eval_samples_per_second": 377.274,
      "eval_steps_per_second": 5.898,
      "step": 141690
    },
    {
      "epoch": 10.0,
      "step": 141690,
      "total_flos": 5.9695438005504e+17,
      "train_loss": 2.237272139724321,
      "train_runtime": 31036.7572,
      "train_samples_per_second": 146.079,
      "train_steps_per_second": 4.565
    }
  ],
  "logging_steps": 1000,
  "max_steps": 141690,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.9695438005504e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}