{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.992874109263658,
  "eval_steps": 500,
  "global_step": 945,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03167062549485352,
      "grad_norm": 1.8530284877370395,
      "learning_rate": 5e-06,
      "loss": 0.8166,
      "step": 10
    },
    {
      "epoch": 0.06334125098970704,
      "grad_norm": 1.5337420189302975,
      "learning_rate": 5e-06,
      "loss": 0.7464,
      "step": 20
    },
    {
      "epoch": 0.09501187648456057,
      "grad_norm": 1.4029937472724519,
      "learning_rate": 5e-06,
      "loss": 0.7243,
      "step": 30
    },
    {
      "epoch": 0.12668250197941408,
      "grad_norm": 0.9360700334222073,
      "learning_rate": 5e-06,
      "loss": 0.7146,
      "step": 40
    },
    {
      "epoch": 0.1583531274742676,
      "grad_norm": 0.994137693760135,
      "learning_rate": 5e-06,
      "loss": 0.6895,
      "step": 50
    },
    {
      "epoch": 0.19002375296912113,
      "grad_norm": 0.9347536063077458,
      "learning_rate": 5e-06,
      "loss": 0.6778,
      "step": 60
    },
    {
      "epoch": 0.22169437846397466,
      "grad_norm": 1.4913361116122625,
      "learning_rate": 5e-06,
      "loss": 0.6594,
      "step": 70
    },
    {
      "epoch": 0.25336500395882816,
      "grad_norm": 0.7126258978453617,
      "learning_rate": 5e-06,
      "loss": 0.6637,
      "step": 80
    },
    {
      "epoch": 0.2850356294536817,
      "grad_norm": 0.9899069063293856,
      "learning_rate": 5e-06,
      "loss": 0.6456,
      "step": 90
    },
    {
      "epoch": 0.3167062549485352,
      "grad_norm": 0.8045731477224973,
      "learning_rate": 5e-06,
      "loss": 0.6537,
      "step": 100
    },
    {
      "epoch": 0.34837688044338877,
      "grad_norm": 0.7578866653732541,
      "learning_rate": 5e-06,
      "loss": 0.6481,
      "step": 110
    },
    {
      "epoch": 0.38004750593824227,
      "grad_norm": 0.5676407147308244,
      "learning_rate": 5e-06,
      "loss": 0.6419,
      "step": 120
    },
    {
      "epoch": 0.4117181314330958,
      "grad_norm": 0.8078622210774858,
      "learning_rate": 5e-06,
      "loss": 0.6479,
      "step": 130
    },
    {
      "epoch": 0.4433887569279493,
      "grad_norm": 0.7647299971015771,
      "learning_rate": 5e-06,
      "loss": 0.6439,
      "step": 140
    },
    {
      "epoch": 0.4750593824228028,
      "grad_norm": 0.5575084199732572,
      "learning_rate": 5e-06,
      "loss": 0.6388,
      "step": 150
    },
    {
      "epoch": 0.5067300079176563,
      "grad_norm": 0.5128045187828547,
      "learning_rate": 5e-06,
      "loss": 0.6347,
      "step": 160
    },
    {
      "epoch": 0.5384006334125099,
      "grad_norm": 0.8861637952775342,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 170
    },
    {
      "epoch": 0.5700712589073634,
      "grad_norm": 0.7708516251715861,
      "learning_rate": 5e-06,
      "loss": 0.6365,
      "step": 180
    },
    {
      "epoch": 0.601741884402217,
      "grad_norm": 0.6980158820500171,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 190
    },
    {
      "epoch": 0.6334125098970704,
      "grad_norm": 0.5994527923990384,
      "learning_rate": 5e-06,
      "loss": 0.6384,
      "step": 200
    },
    {
      "epoch": 0.665083135391924,
      "grad_norm": 1.2967423762938355,
      "learning_rate": 5e-06,
      "loss": 0.6369,
      "step": 210
    },
    {
      "epoch": 0.6967537608867775,
      "grad_norm": 0.47535091611506025,
      "learning_rate": 5e-06,
      "loss": 0.6321,
      "step": 220
    },
    {
      "epoch": 0.728424386381631,
      "grad_norm": 0.6353037224955166,
      "learning_rate": 5e-06,
      "loss": 0.6241,
      "step": 230
    },
    {
      "epoch": 0.7600950118764845,
      "grad_norm": 0.6601833755256151,
      "learning_rate": 5e-06,
      "loss": 0.623,
      "step": 240
    },
    {
      "epoch": 0.7917656373713381,
      "grad_norm": 0.6122026802056224,
      "learning_rate": 5e-06,
      "loss": 0.6263,
      "step": 250
    },
    {
      "epoch": 0.8234362628661916,
      "grad_norm": 0.5075094381024849,
      "learning_rate": 5e-06,
      "loss": 0.6301,
      "step": 260
    },
    {
      "epoch": 0.8551068883610451,
      "grad_norm": 0.5422031773802576,
      "learning_rate": 5e-06,
      "loss": 0.6292,
      "step": 270
    },
    {
      "epoch": 0.8867775138558986,
      "grad_norm": 0.42516469360864606,
      "learning_rate": 5e-06,
      "loss": 0.6177,
      "step": 280
    },
    {
      "epoch": 0.9184481393507522,
      "grad_norm": 0.7465232508313683,
      "learning_rate": 5e-06,
      "loss": 0.6197,
      "step": 290
    },
    {
      "epoch": 0.9501187648456056,
      "grad_norm": 0.5164060465761252,
      "learning_rate": 5e-06,
      "loss": 0.6286,
      "step": 300
    },
    {
      "epoch": 0.9817893903404592,
      "grad_norm": 0.6125601748565044,
      "learning_rate": 5e-06,
      "loss": 0.6301,
      "step": 310
    },
    {
      "epoch": 0.997624703087886,
      "eval_loss": 0.6208310723304749,
      "eval_runtime": 170.2387,
      "eval_samples_per_second": 49.965,
      "eval_steps_per_second": 0.394,
      "step": 315
    },
    {
      "epoch": 1.0134600158353126,
      "grad_norm": 0.8577950377026092,
      "learning_rate": 5e-06,
      "loss": 0.6042,
      "step": 320
    },
    {
      "epoch": 1.0451306413301662,
      "grad_norm": 0.563239613051996,
      "learning_rate": 5e-06,
      "loss": 0.5769,
      "step": 330
    },
    {
      "epoch": 1.0768012668250198,
      "grad_norm": 0.5011250455738395,
      "learning_rate": 5e-06,
      "loss": 0.5753,
      "step": 340
    },
    {
      "epoch": 1.1084718923198733,
      "grad_norm": 0.507779510609032,
      "learning_rate": 5e-06,
      "loss": 0.5755,
      "step": 350
    },
    {
      "epoch": 1.1401425178147269,
      "grad_norm": 0.5864390033721468,
      "learning_rate": 5e-06,
      "loss": 0.5731,
      "step": 360
    },
    {
      "epoch": 1.1718131433095804,
      "grad_norm": 0.5651111499269338,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 370
    },
    {
      "epoch": 1.203483768804434,
      "grad_norm": 0.6150558670403593,
      "learning_rate": 5e-06,
      "loss": 0.5729,
      "step": 380
    },
    {
      "epoch": 1.2351543942992875,
      "grad_norm": 0.5808136283436993,
      "learning_rate": 5e-06,
      "loss": 0.5718,
      "step": 390
    },
    {
      "epoch": 1.2668250197941409,
      "grad_norm": 0.5830305205374973,
      "learning_rate": 5e-06,
      "loss": 0.5761,
      "step": 400
    },
    {
      "epoch": 1.2984956452889944,
      "grad_norm": 0.5467014072656838,
      "learning_rate": 5e-06,
      "loss": 0.5799,
      "step": 410
    },
    {
      "epoch": 1.330166270783848,
      "grad_norm": 0.5214522227982674,
      "learning_rate": 5e-06,
      "loss": 0.5734,
      "step": 420
    },
    {
      "epoch": 1.3618368962787015,
      "grad_norm": 0.7855825418949094,
      "learning_rate": 5e-06,
      "loss": 0.5712,
      "step": 430
    },
    {
      "epoch": 1.393507521773555,
      "grad_norm": 0.5656871920545978,
      "learning_rate": 5e-06,
      "loss": 0.576,
      "step": 440
    },
    {
      "epoch": 1.4251781472684084,
      "grad_norm": 0.5944245365102998,
      "learning_rate": 5e-06,
      "loss": 0.5708,
      "step": 450
    },
    {
      "epoch": 1.4568487727632622,
      "grad_norm": 0.5333261966632318,
      "learning_rate": 5e-06,
      "loss": 0.5671,
      "step": 460
    },
    {
      "epoch": 1.4885193982581155,
      "grad_norm": 0.491104016024968,
      "learning_rate": 5e-06,
      "loss": 0.5743,
      "step": 470
    },
    {
      "epoch": 1.520190023752969,
      "grad_norm": 0.5014163060947014,
      "learning_rate": 5e-06,
      "loss": 0.5728,
      "step": 480
    },
    {
      "epoch": 1.5518606492478226,
      "grad_norm": 0.4892383437610088,
      "learning_rate": 5e-06,
      "loss": 0.5802,
      "step": 490
    },
    {
      "epoch": 1.5835312747426762,
      "grad_norm": 0.4989580296588211,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 500
    },
    {
      "epoch": 1.6152019002375297,
      "grad_norm": 0.616735446474154,
      "learning_rate": 5e-06,
      "loss": 0.5716,
      "step": 510
    },
    {
      "epoch": 1.646872525732383,
      "grad_norm": 0.5387044899734632,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 520
    },
    {
      "epoch": 1.6785431512272369,
      "grad_norm": 0.5526914425760312,
      "learning_rate": 5e-06,
      "loss": 0.5684,
      "step": 530
    },
    {
      "epoch": 1.7102137767220902,
      "grad_norm": 0.6545410316398991,
      "learning_rate": 5e-06,
      "loss": 0.5702,
      "step": 540
    },
    {
      "epoch": 1.7418844022169437,
      "grad_norm": 0.5254133456615973,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 550
    },
    {
      "epoch": 1.7735550277117973,
      "grad_norm": 0.521939360967283,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 560
    },
    {
      "epoch": 1.8052256532066508,
      "grad_norm": 0.463070184825861,
      "learning_rate": 5e-06,
      "loss": 0.5818,
      "step": 570
    },
    {
      "epoch": 1.8368962787015044,
      "grad_norm": 0.6381561992890791,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 580
    },
    {
      "epoch": 1.8685669041963577,
      "grad_norm": 0.6496360141809046,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 590
    },
    {
      "epoch": 1.9002375296912115,
      "grad_norm": 0.5324989809239741,
      "learning_rate": 5e-06,
      "loss": 0.566,
      "step": 600
    },
    {
      "epoch": 1.9319081551860648,
      "grad_norm": 0.5111765382529705,
      "learning_rate": 5e-06,
      "loss": 0.5708,
      "step": 610
    },
    {
      "epoch": 1.9635787806809184,
      "grad_norm": 0.5610135178229989,
      "learning_rate": 5e-06,
      "loss": 0.574,
      "step": 620
    },
    {
      "epoch": 1.995249406175772,
      "grad_norm": 0.6915334000048331,
      "learning_rate": 5e-06,
      "loss": 0.58,
      "step": 630
    },
    {
      "epoch": 1.9984164687252575,
      "eval_loss": 0.6117470860481262,
      "eval_runtime": 171.4849,
      "eval_samples_per_second": 49.602,
      "eval_steps_per_second": 0.391,
      "step": 631
    },
    {
      "epoch": 2.0269200316706253,
      "grad_norm": 0.616676539774963,
      "learning_rate": 5e-06,
      "loss": 0.528,
      "step": 640
    },
    {
      "epoch": 2.058590657165479,
      "grad_norm": 0.6073136045120692,
      "learning_rate": 5e-06,
      "loss": 0.5195,
      "step": 650
    },
    {
      "epoch": 2.0902612826603324,
      "grad_norm": 0.5470687590939235,
      "learning_rate": 5e-06,
      "loss": 0.5258,
      "step": 660
    },
    {
      "epoch": 2.121931908155186,
      "grad_norm": 0.5476384369380188,
      "learning_rate": 5e-06,
      "loss": 0.5207,
      "step": 670
    },
    {
      "epoch": 2.1536025336500395,
      "grad_norm": 0.5807654736471419,
      "learning_rate": 5e-06,
      "loss": 0.5229,
      "step": 680
    },
    {
      "epoch": 2.1852731591448933,
      "grad_norm": 0.5658725818639244,
      "learning_rate": 5e-06,
      "loss": 0.5155,
      "step": 690
    },
    {
      "epoch": 2.2169437846397466,
      "grad_norm": 0.5270348810540799,
      "learning_rate": 5e-06,
      "loss": 0.5257,
      "step": 700
    },
    {
      "epoch": 2.2486144101346,
      "grad_norm": 0.6710965861689625,
      "learning_rate": 5e-06,
      "loss": 0.5148,
      "step": 710
    },
    {
      "epoch": 2.2802850356294537,
      "grad_norm": 0.5096235564504402,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 720
    },
    {
      "epoch": 2.311955661124307,
      "grad_norm": 0.5550769313636378,
      "learning_rate": 5e-06,
      "loss": 0.5171,
      "step": 730
    },
    {
      "epoch": 2.343626286619161,
      "grad_norm": 0.7122071616039661,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 740
    },
    {
      "epoch": 2.375296912114014,
      "grad_norm": 0.5158288008674774,
      "learning_rate": 5e-06,
      "loss": 0.5255,
      "step": 750
    },
    {
      "epoch": 2.406967537608868,
      "grad_norm": 0.5744842064968558,
      "learning_rate": 5e-06,
      "loss": 0.5232,
      "step": 760
    },
    {
      "epoch": 2.4386381631037213,
      "grad_norm": 0.48569927712881583,
      "learning_rate": 5e-06,
      "loss": 0.5231,
      "step": 770
    },
    {
      "epoch": 2.470308788598575,
      "grad_norm": 0.6323853806061133,
      "learning_rate": 5e-06,
      "loss": 0.5199,
      "step": 780
    },
    {
      "epoch": 2.5019794140934284,
      "grad_norm": 0.5506449580962853,
      "learning_rate": 5e-06,
      "loss": 0.5192,
      "step": 790
    },
    {
      "epoch": 2.5336500395882817,
      "grad_norm": 0.4790039532718334,
      "learning_rate": 5e-06,
      "loss": 0.5179,
      "step": 800
    },
    {
      "epoch": 2.5653206650831355,
      "grad_norm": 0.5390096335172841,
      "learning_rate": 5e-06,
      "loss": 0.5269,
      "step": 810
    },
    {
      "epoch": 2.596991290577989,
      "grad_norm": 0.5217848609501268,
      "learning_rate": 5e-06,
      "loss": 0.5229,
      "step": 820
    },
    {
      "epoch": 2.6286619160728426,
      "grad_norm": 0.5196866879306132,
      "learning_rate": 5e-06,
      "loss": 0.5217,
      "step": 830
    },
    {
      "epoch": 2.660332541567696,
      "grad_norm": 0.5333118708551611,
      "learning_rate": 5e-06,
      "loss": 0.525,
      "step": 840
    },
    {
      "epoch": 2.6920031670625493,
      "grad_norm": 0.5483398687319767,
      "learning_rate": 5e-06,
      "loss": 0.5224,
      "step": 850
    },
    {
      "epoch": 2.723673792557403,
      "grad_norm": 0.5117263088559509,
      "learning_rate": 5e-06,
      "loss": 0.5283,
      "step": 860
    },
    {
      "epoch": 2.7553444180522564,
      "grad_norm": 0.5377436931762913,
      "learning_rate": 5e-06,
      "loss": 0.5232,
      "step": 870
    },
    {
      "epoch": 2.78701504354711,
      "grad_norm": 0.5662585202942261,
      "learning_rate": 5e-06,
      "loss": 0.5274,
      "step": 880
    },
    {
      "epoch": 2.8186856690419635,
      "grad_norm": 0.48954404185612554,
      "learning_rate": 5e-06,
      "loss": 0.5273,
      "step": 890
    },
    {
      "epoch": 2.850356294536817,
      "grad_norm": 0.5387435059221906,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 900
    },
    {
      "epoch": 2.8820269200316706,
      "grad_norm": 0.5630237900783817,
      "learning_rate": 5e-06,
      "loss": 0.5228,
      "step": 910
    },
    {
      "epoch": 2.9136975455265244,
      "grad_norm": 0.4995116097709295,
      "learning_rate": 5e-06,
      "loss": 0.5247,
      "step": 920
    },
    {
      "epoch": 2.9453681710213777,
      "grad_norm": 0.49890480333211984,
      "learning_rate": 5e-06,
      "loss": 0.521,
      "step": 930
    },
    {
      "epoch": 2.977038796516231,
      "grad_norm": 0.5217828385490265,
      "learning_rate": 5e-06,
      "loss": 0.5261,
      "step": 940
    },
    {
      "epoch": 2.992874109263658,
      "eval_loss": 0.6157404184341431,
      "eval_runtime": 171.3558,
      "eval_samples_per_second": 49.639,
      "eval_steps_per_second": 0.391,
      "step": 945
    },
    {
      "epoch": 2.992874109263658,
      "step": 945,
      "total_flos": 1582491437629440.0,
      "train_loss": 0.5832562999119835,
      "train_runtime": 28511.6322,
      "train_samples_per_second": 17.004,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 945,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1582491437629440.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}