{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.993238166791886,
  "eval_steps": 500,
  "global_step": 996,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03005259203606311,
      "grad_norm": 4.810495218440688,
      "learning_rate": 5e-06,
      "loss": 0.7926,
      "step": 10
    },
    {
      "epoch": 0.06010518407212622,
      "grad_norm": 3.155508530359899,
      "learning_rate": 5e-06,
      "loss": 0.722,
      "step": 20
    },
    {
      "epoch": 0.09015777610818933,
      "grad_norm": 1.225544271167285,
      "learning_rate": 5e-06,
      "loss": 0.7085,
      "step": 30
    },
    {
      "epoch": 0.12021036814425244,
      "grad_norm": 0.9265761014813488,
      "learning_rate": 5e-06,
      "loss": 0.6829,
      "step": 40
    },
    {
      "epoch": 0.15026296018031554,
      "grad_norm": 1.2106246073988878,
      "learning_rate": 5e-06,
      "loss": 0.6706,
      "step": 50
    },
    {
      "epoch": 0.18031555221637866,
      "grad_norm": 1.9380737235516812,
      "learning_rate": 5e-06,
      "loss": 0.6637,
      "step": 60
    },
    {
      "epoch": 0.21036814425244177,
      "grad_norm": 0.8111060604176487,
      "learning_rate": 5e-06,
      "loss": 0.6536,
      "step": 70
    },
    {
      "epoch": 0.24042073628850488,
      "grad_norm": 0.9088296756504328,
      "learning_rate": 5e-06,
      "loss": 0.6441,
      "step": 80
    },
    {
      "epoch": 0.270473328324568,
      "grad_norm": 0.6375258367082703,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 90
    },
    {
      "epoch": 0.3005259203606311,
      "grad_norm": 1.2329946577337598,
      "learning_rate": 5e-06,
      "loss": 0.6293,
      "step": 100
    },
    {
      "epoch": 0.3305785123966942,
      "grad_norm": 0.5905978981231776,
      "learning_rate": 5e-06,
      "loss": 0.6255,
      "step": 110
    },
    {
      "epoch": 0.3606311044327573,
      "grad_norm": 0.5247850741074309,
      "learning_rate": 5e-06,
      "loss": 0.6275,
      "step": 120
    },
    {
      "epoch": 0.39068369646882045,
      "grad_norm": 0.6903136425418634,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 130
    },
    {
      "epoch": 0.42073628850488354,
      "grad_norm": 0.5428640707055408,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 140
    },
    {
      "epoch": 0.4507888805409467,
      "grad_norm": 0.7725320921008048,
      "learning_rate": 5e-06,
      "loss": 0.6265,
      "step": 150
    },
    {
      "epoch": 0.48084147257700977,
      "grad_norm": 0.6353293754243331,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 160
    },
    {
      "epoch": 0.5108940646130729,
      "grad_norm": 0.696998211557625,
      "learning_rate": 5e-06,
      "loss": 0.6222,
      "step": 170
    },
    {
      "epoch": 0.540946656649136,
      "grad_norm": 0.5725933318808639,
      "learning_rate": 5e-06,
      "loss": 0.6233,
      "step": 180
    },
    {
      "epoch": 0.5709992486851991,
      "grad_norm": 0.5406370000400791,
      "learning_rate": 5e-06,
      "loss": 0.6141,
      "step": 190
    },
    {
      "epoch": 0.6010518407212622,
      "grad_norm": 0.5328195355513519,
      "learning_rate": 5e-06,
      "loss": 0.6135,
      "step": 200
    },
    {
      "epoch": 0.6311044327573253,
      "grad_norm": 0.5757875631327323,
      "learning_rate": 5e-06,
      "loss": 0.6177,
      "step": 210
    },
    {
      "epoch": 0.6611570247933884,
      "grad_norm": 0.511696604236356,
      "learning_rate": 5e-06,
      "loss": 0.6221,
      "step": 220
    },
    {
      "epoch": 0.6912096168294516,
      "grad_norm": 0.5535745253744225,
      "learning_rate": 5e-06,
      "loss": 0.6093,
      "step": 230
    },
    {
      "epoch": 0.7212622088655146,
      "grad_norm": 0.5080323201706937,
      "learning_rate": 5e-06,
      "loss": 0.6198,
      "step": 240
    },
    {
      "epoch": 0.7513148009015778,
      "grad_norm": 0.7440297135028378,
      "learning_rate": 5e-06,
      "loss": 0.6141,
      "step": 250
    },
    {
      "epoch": 0.7813673929376409,
      "grad_norm": 0.5240609426616543,
      "learning_rate": 5e-06,
      "loss": 0.6001,
      "step": 260
    },
    {
      "epoch": 0.8114199849737039,
      "grad_norm": 0.5170330245787976,
      "learning_rate": 5e-06,
      "loss": 0.6077,
      "step": 270
    },
    {
      "epoch": 0.8414725770097671,
      "grad_norm": 0.469645994556069,
      "learning_rate": 5e-06,
      "loss": 0.6158,
      "step": 280
    },
    {
      "epoch": 0.8715251690458302,
      "grad_norm": 0.48882124388027265,
      "learning_rate": 5e-06,
      "loss": 0.6114,
      "step": 290
    },
    {
      "epoch": 0.9015777610818934,
      "grad_norm": 0.5404896464902132,
      "learning_rate": 5e-06,
      "loss": 0.6075,
      "step": 300
    },
    {
      "epoch": 0.9316303531179564,
      "grad_norm": 0.5064337097380566,
      "learning_rate": 5e-06,
      "loss": 0.5973,
      "step": 310
    },
    {
      "epoch": 0.9616829451540195,
      "grad_norm": 0.5395666354340096,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 320
    },
    {
      "epoch": 0.9917355371900827,
      "grad_norm": 0.5443646281574408,
      "learning_rate": 5e-06,
      "loss": 0.6024,
      "step": 330
    },
    {
      "epoch": 0.9977460555972952,
      "eval_loss": 0.602512001991272,
      "eval_runtime": 115.6071,
      "eval_samples_per_second": 77.547,
      "eval_steps_per_second": 0.614,
      "step": 332
    },
    {
      "epoch": 1.0217881292261457,
      "grad_norm": 0.6043688308355755,
      "learning_rate": 5e-06,
      "loss": 0.5718,
      "step": 340
    },
    {
      "epoch": 1.051840721262209,
      "grad_norm": 0.5742469748119633,
      "learning_rate": 5e-06,
      "loss": 0.5559,
      "step": 350
    },
    {
      "epoch": 1.081893313298272,
      "grad_norm": 0.5641150930410421,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 360
    },
    {
      "epoch": 1.111945905334335,
      "grad_norm": 0.5251750994837489,
      "learning_rate": 5e-06,
      "loss": 0.5559,
      "step": 370
    },
    {
      "epoch": 1.1419984973703983,
      "grad_norm": 0.5594749881743232,
      "learning_rate": 5e-06,
      "loss": 0.5548,
      "step": 380
    },
    {
      "epoch": 1.1720510894064613,
      "grad_norm": 0.5192017193387347,
      "learning_rate": 5e-06,
      "loss": 0.5616,
      "step": 390
    },
    {
      "epoch": 1.2021036814425243,
      "grad_norm": 0.5006444275848907,
      "learning_rate": 5e-06,
      "loss": 0.553,
      "step": 400
    },
    {
      "epoch": 1.2321562734785876,
      "grad_norm": 0.5182596004381461,
      "learning_rate": 5e-06,
      "loss": 0.5534,
      "step": 410
    },
    {
      "epoch": 1.2622088655146506,
      "grad_norm": 0.48792630443547186,
      "learning_rate": 5e-06,
      "loss": 0.5596,
      "step": 420
    },
    {
      "epoch": 1.2922614575507136,
      "grad_norm": 0.4971407424850785,
      "learning_rate": 5e-06,
      "loss": 0.5559,
      "step": 430
    },
    {
      "epoch": 1.322314049586777,
      "grad_norm": 0.4769582306533125,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 440
    },
    {
      "epoch": 1.35236664162284,
      "grad_norm": 0.5696514756007727,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 450
    },
    {
      "epoch": 1.382419233658903,
      "grad_norm": 0.47028266891611087,
      "learning_rate": 5e-06,
      "loss": 0.5552,
      "step": 460
    },
    {
      "epoch": 1.4124718256949662,
      "grad_norm": 0.5142628164984128,
      "learning_rate": 5e-06,
      "loss": 0.5562,
      "step": 470
    },
    {
      "epoch": 1.4425244177310292,
      "grad_norm": 0.4298738340578577,
      "learning_rate": 5e-06,
      "loss": 0.5444,
      "step": 480
    },
    {
      "epoch": 1.4725770097670925,
      "grad_norm": 0.5886177624554544,
      "learning_rate": 5e-06,
      "loss": 0.553,
      "step": 490
    },
    {
      "epoch": 1.5026296018031555,
      "grad_norm": 0.479969456761664,
      "learning_rate": 5e-06,
      "loss": 0.5615,
      "step": 500
    },
    {
      "epoch": 1.5326821938392188,
      "grad_norm": 0.546008279918396,
      "learning_rate": 5e-06,
      "loss": 0.5588,
      "step": 510
    },
    {
      "epoch": 1.5627347858752816,
      "grad_norm": 0.5113023466894115,
      "learning_rate": 5e-06,
      "loss": 0.5515,
      "step": 520
    },
    {
      "epoch": 1.5927873779113448,
      "grad_norm": 0.4638757502509985,
      "learning_rate": 5e-06,
      "loss": 0.5602,
      "step": 530
    },
    {
      "epoch": 1.622839969947408,
      "grad_norm": 0.5396179709473905,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 540
    },
    {
      "epoch": 1.6528925619834711,
      "grad_norm": 0.5307757961312712,
      "learning_rate": 5e-06,
      "loss": 0.5522,
      "step": 550
    },
    {
      "epoch": 1.6829451540195342,
      "grad_norm": 0.5717833229926992,
      "learning_rate": 5e-06,
      "loss": 0.5539,
      "step": 560
    },
    {
      "epoch": 1.7129977460555974,
      "grad_norm": 0.5148441797108223,
      "learning_rate": 5e-06,
      "loss": 0.5486,
      "step": 570
    },
    {
      "epoch": 1.7430503380916604,
      "grad_norm": 0.4744973222154149,
      "learning_rate": 5e-06,
      "loss": 0.5515,
      "step": 580
    },
    {
      "epoch": 1.7731029301277235,
      "grad_norm": 0.4644006036526441,
      "learning_rate": 5e-06,
      "loss": 0.5529,
      "step": 590
    },
    {
      "epoch": 1.8031555221637867,
      "grad_norm": 0.5742071808595426,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 600
    },
    {
      "epoch": 1.8332081141998497,
      "grad_norm": 0.524857373914898,
      "learning_rate": 5e-06,
      "loss": 0.5548,
      "step": 610
    },
    {
      "epoch": 1.8632607062359128,
      "grad_norm": 0.5108979951578987,
      "learning_rate": 5e-06,
      "loss": 0.5517,
      "step": 620
    },
    {
      "epoch": 1.893313298271976,
      "grad_norm": 0.5651055404638835,
      "learning_rate": 5e-06,
      "loss": 0.5566,
      "step": 630
    },
    {
      "epoch": 1.923365890308039,
      "grad_norm": 0.4596795240284601,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 640
    },
    {
      "epoch": 1.953418482344102,
      "grad_norm": 0.5231843841522166,
      "learning_rate": 5e-06,
      "loss": 0.5543,
      "step": 650
    },
    {
      "epoch": 1.9834710743801653,
      "grad_norm": 0.4613490632623934,
      "learning_rate": 5e-06,
      "loss": 0.5526,
      "step": 660
    },
    {
      "epoch": 1.998497370398197,
      "eval_loss": 0.5943902730941772,
      "eval_runtime": 116.0841,
      "eval_samples_per_second": 77.229,
      "eval_steps_per_second": 0.612,
      "step": 665
    },
    {
      "epoch": 2.0135236664162286,
      "grad_norm": 1.0151560958110475,
      "learning_rate": 5e-06,
      "loss": 0.5296,
      "step": 670
    },
    {
      "epoch": 2.0435762584522914,
      "grad_norm": 0.5753358656630801,
      "learning_rate": 5e-06,
      "loss": 0.5029,
      "step": 680
    },
    {
      "epoch": 2.0736288504883547,
      "grad_norm": 0.535229224584348,
      "learning_rate": 5e-06,
      "loss": 0.5033,
      "step": 690
    },
    {
      "epoch": 2.103681442524418,
      "grad_norm": 0.5308477913499555,
      "learning_rate": 5e-06,
      "loss": 0.4981,
      "step": 700
    },
    {
      "epoch": 2.1337340345604807,
      "grad_norm": 0.5032127296654162,
      "learning_rate": 5e-06,
      "loss": 0.5025,
      "step": 710
    },
    {
      "epoch": 2.163786626596544,
      "grad_norm": 0.5692227883516503,
      "learning_rate": 5e-06,
      "loss": 0.5052,
      "step": 720
    },
    {
      "epoch": 2.1938392186326072,
      "grad_norm": 0.5408850500925536,
      "learning_rate": 5e-06,
      "loss": 0.4995,
      "step": 730
    },
    {
      "epoch": 2.22389181066867,
      "grad_norm": 0.47006049113767784,
      "learning_rate": 5e-06,
      "loss": 0.5053,
      "step": 740
    },
    {
      "epoch": 2.2539444027047333,
      "grad_norm": 0.6577893214388708,
      "learning_rate": 5e-06,
      "loss": 0.5068,
      "step": 750
    },
    {
      "epoch": 2.2839969947407965,
      "grad_norm": 0.5453995235461279,
      "learning_rate": 5e-06,
      "loss": 0.5025,
      "step": 760
    },
    {
      "epoch": 2.3140495867768593,
      "grad_norm": 0.5111820993930344,
      "learning_rate": 5e-06,
      "loss": 0.5025,
      "step": 770
    },
    {
      "epoch": 2.3441021788129226,
      "grad_norm": 0.5019244840316214,
      "learning_rate": 5e-06,
      "loss": 0.5059,
      "step": 780
    },
    {
      "epoch": 2.374154770848986,
      "grad_norm": 0.4860932571878682,
      "learning_rate": 5e-06,
      "loss": 0.5057,
      "step": 790
    },
    {
      "epoch": 2.4042073628850487,
      "grad_norm": 0.5105307181041621,
      "learning_rate": 5e-06,
      "loss": 0.5104,
      "step": 800
    },
    {
      "epoch": 2.434259954921112,
      "grad_norm": 0.4985867132406256,
      "learning_rate": 5e-06,
      "loss": 0.5078,
      "step": 810
    },
    {
      "epoch": 2.464312546957175,
      "grad_norm": 0.5460937499191226,
      "learning_rate": 5e-06,
      "loss": 0.5091,
      "step": 820
    },
    {
      "epoch": 2.494365138993238,
      "grad_norm": 0.4887915411999051,
      "learning_rate": 5e-06,
      "loss": 0.5084,
      "step": 830
    },
    {
      "epoch": 2.5244177310293012,
      "grad_norm": 0.48194363042727245,
      "learning_rate": 5e-06,
      "loss": 0.5041,
      "step": 840
    },
    {
      "epoch": 2.5544703230653645,
      "grad_norm": 0.5346871895488505,
      "learning_rate": 5e-06,
      "loss": 0.5041,
      "step": 850
    },
    {
      "epoch": 2.5845229151014273,
      "grad_norm": 0.5741375180255713,
      "learning_rate": 5e-06,
      "loss": 0.5044,
      "step": 860
    },
    {
      "epoch": 2.6145755071374905,
      "grad_norm": 0.5215520764416209,
      "learning_rate": 5e-06,
      "loss": 0.5113,
      "step": 870
    },
    {
      "epoch": 2.644628099173554,
      "grad_norm": 0.5134970783379779,
      "learning_rate": 5e-06,
      "loss": 0.5037,
      "step": 880
    },
    {
      "epoch": 2.674680691209617,
      "grad_norm": 0.5673341470254984,
      "learning_rate": 5e-06,
      "loss": 0.5126,
      "step": 890
    },
    {
      "epoch": 2.70473328324568,
      "grad_norm": 0.5345905680232906,
      "learning_rate": 5e-06,
      "loss": 0.5071,
      "step": 900
    },
    {
      "epoch": 2.734785875281743,
      "grad_norm": 0.6651222249433006,
      "learning_rate": 5e-06,
      "loss": 0.5069,
      "step": 910
    },
    {
      "epoch": 2.764838467317806,
      "grad_norm": 0.5593066350235659,
      "learning_rate": 5e-06,
      "loss": 0.5105,
      "step": 920
    },
    {
      "epoch": 2.794891059353869,
      "grad_norm": 0.45810869599006515,
      "learning_rate": 5e-06,
      "loss": 0.5058,
      "step": 930
    },
    {
      "epoch": 2.8249436513899324,
      "grad_norm": 0.5097170540621323,
      "learning_rate": 5e-06,
      "loss": 0.5086,
      "step": 940
    },
    {
      "epoch": 2.8549962434259957,
      "grad_norm": 0.61115109062032,
      "learning_rate": 5e-06,
      "loss": 0.5097,
      "step": 950
    },
    {
      "epoch": 2.8850488354620585,
      "grad_norm": 0.5416071063853659,
      "learning_rate": 5e-06,
      "loss": 0.506,
      "step": 960
    },
    {
      "epoch": 2.9151014274981217,
      "grad_norm": 0.5010400411025377,
      "learning_rate": 5e-06,
      "loss": 0.5084,
      "step": 970
    },
    {
      "epoch": 2.945154019534185,
      "grad_norm": 0.505134630851122,
      "learning_rate": 5e-06,
      "loss": 0.5054,
      "step": 980
    },
    {
      "epoch": 2.975206611570248,
      "grad_norm": 0.5247938588776304,
      "learning_rate": 5e-06,
      "loss": 0.5067,
      "step": 990
    },
    {
      "epoch": 2.993238166791886,
      "eval_loss": 0.6004215478897095,
      "eval_runtime": 113.1633,
      "eval_samples_per_second": 79.222,
      "eval_steps_per_second": 0.627,
      "step": 996
    },
    {
      "epoch": 2.993238166791886,
      "step": 996,
      "total_flos": 1667918337146880.0,
      "train_loss": 0.5658263341490045,
      "train_runtime": 16943.5847,
      "train_samples_per_second": 30.156,
      "train_steps_per_second": 0.059
    }
  ],
  "logging_steps": 10,
  "max_steps": 996,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1667918337146880.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}