{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.998551424432641,
  "eval_steps": 500,
  "global_step": 3105,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009657170449058426,
      "grad_norm": 3.7815811768441914,
      "learning_rate": 5e-06,
      "loss": 0.6253,
      "step": 10
    },
    {
      "epoch": 0.01931434089811685,
      "grad_norm": 2.097627838391192,
      "learning_rate": 5e-06,
      "loss": 0.5439,
      "step": 20
    },
    {
      "epoch": 0.028971511347175277,
      "grad_norm": 1.991134226246532,
      "learning_rate": 5e-06,
      "loss": 0.5217,
      "step": 30
    },
    {
      "epoch": 0.0386286817962337,
      "grad_norm": 1.786626694141357,
      "learning_rate": 5e-06,
      "loss": 0.5096,
      "step": 40
    },
    {
      "epoch": 0.04828585224529213,
      "grad_norm": 1.717098828666389,
      "learning_rate": 5e-06,
      "loss": 0.5039,
      "step": 50
    },
    {
      "epoch": 0.05794302269435055,
      "grad_norm": 1.8094322954355757,
      "learning_rate": 5e-06,
      "loss": 0.5015,
      "step": 60
    },
    {
      "epoch": 0.06760019314340898,
      "grad_norm": 2.0392174555732336,
      "learning_rate": 5e-06,
      "loss": 0.4883,
      "step": 70
    },
    {
      "epoch": 0.0772573635924674,
      "grad_norm": 1.9677308677580745,
      "learning_rate": 5e-06,
      "loss": 0.4874,
      "step": 80
    },
    {
      "epoch": 0.08691453404152583,
      "grad_norm": 1.7766414325618487,
      "learning_rate": 5e-06,
      "loss": 0.4874,
      "step": 90
    },
    {
      "epoch": 0.09657170449058426,
      "grad_norm": 1.6816893104701698,
      "learning_rate": 5e-06,
      "loss": 0.4803,
      "step": 100
    },
    {
      "epoch": 0.10622887493964268,
      "grad_norm": 1.4814895014838179,
      "learning_rate": 5e-06,
      "loss": 0.4781,
      "step": 110
    },
    {
      "epoch": 0.1158860453887011,
      "grad_norm": 1.809417684888777,
      "learning_rate": 5e-06,
      "loss": 0.477,
      "step": 120
    },
    {
      "epoch": 0.12554321583775954,
      "grad_norm": 1.604499406008116,
      "learning_rate": 5e-06,
      "loss": 0.4745,
      "step": 130
    },
    {
      "epoch": 0.13520038628681796,
      "grad_norm": 1.5342478528722183,
      "learning_rate": 5e-06,
      "loss": 0.4733,
      "step": 140
    },
    {
      "epoch": 0.14485755673587639,
      "grad_norm": 1.7155349659817856,
      "learning_rate": 5e-06,
      "loss": 0.4733,
      "step": 150
    },
    {
      "epoch": 0.1545147271849348,
      "grad_norm": 1.4873226511627278,
      "learning_rate": 5e-06,
      "loss": 0.4717,
      "step": 160
    },
    {
      "epoch": 0.16417189763399323,
      "grad_norm": 1.5335342617066339,
      "learning_rate": 5e-06,
      "loss": 0.4703,
      "step": 170
    },
    {
      "epoch": 0.17382906808305165,
      "grad_norm": 1.692809745357256,
      "learning_rate": 5e-06,
      "loss": 0.473,
      "step": 180
    },
    {
      "epoch": 0.1834862385321101,
      "grad_norm": 1.6215225290403987,
      "learning_rate": 5e-06,
      "loss": 0.4713,
      "step": 190
    },
    {
      "epoch": 0.19314340898116852,
      "grad_norm": 1.61595338484294,
      "learning_rate": 5e-06,
      "loss": 0.4666,
      "step": 200
    },
    {
      "epoch": 0.20280057943022695,
      "grad_norm": 1.535971453026849,
      "learning_rate": 5e-06,
      "loss": 0.4644,
      "step": 210
    },
    {
      "epoch": 0.21245774987928537,
      "grad_norm": 1.448430643756855,
      "learning_rate": 5e-06,
      "loss": 0.4583,
      "step": 220
    },
    {
      "epoch": 0.2221149203283438,
      "grad_norm": 1.6067011202263324,
      "learning_rate": 5e-06,
      "loss": 0.4575,
      "step": 230
    },
    {
      "epoch": 0.2317720907774022,
      "grad_norm": 1.4643615572820308,
      "learning_rate": 5e-06,
      "loss": 0.4554,
      "step": 240
    },
    {
      "epoch": 0.24142926122646063,
      "grad_norm": 1.5756879099170888,
      "learning_rate": 5e-06,
      "loss": 0.4661,
      "step": 250
    },
    {
      "epoch": 0.2510864316755191,
      "grad_norm": 1.6966284718047921,
      "learning_rate": 5e-06,
      "loss": 0.4586,
      "step": 260
    },
    {
      "epoch": 0.2607436021245775,
      "grad_norm": 1.6310274076314688,
      "learning_rate": 5e-06,
      "loss": 0.4616,
      "step": 270
    },
    {
      "epoch": 0.27040077257363593,
      "grad_norm": 1.5908628903198843,
      "learning_rate": 5e-06,
      "loss": 0.456,
      "step": 280
    },
    {
      "epoch": 0.2800579430226944,
      "grad_norm": 1.5321607540064421,
      "learning_rate": 5e-06,
      "loss": 0.4632,
      "step": 290
    },
    {
      "epoch": 0.28971511347175277,
      "grad_norm": 1.5432080707621136,
      "learning_rate": 5e-06,
      "loss": 0.4498,
      "step": 300
    },
    {
      "epoch": 0.2993722839208112,
      "grad_norm": 1.5174884163746274,
      "learning_rate": 5e-06,
      "loss": 0.4547,
      "step": 310
    },
    {
      "epoch": 0.3090294543698696,
      "grad_norm": 1.4431913116342714,
      "learning_rate": 5e-06,
      "loss": 0.448,
      "step": 320
    },
    {
      "epoch": 0.31868662481892807,
      "grad_norm": 1.7071593067951876,
      "learning_rate": 5e-06,
      "loss": 0.4493,
      "step": 330
    },
    {
      "epoch": 0.32834379526798646,
      "grad_norm": 1.3857947339153756,
      "learning_rate": 5e-06,
      "loss": 0.4545,
      "step": 340
    },
    {
      "epoch": 0.3380009657170449,
      "grad_norm": 1.4688790780164955,
      "learning_rate": 5e-06,
      "loss": 0.4465,
      "step": 350
    },
    {
      "epoch": 0.3476581361661033,
      "grad_norm": 1.4620765033414924,
      "learning_rate": 5e-06,
      "loss": 0.4495,
      "step": 360
    },
    {
      "epoch": 0.35731530661516175,
      "grad_norm": 1.4806186634505536,
      "learning_rate": 5e-06,
      "loss": 0.4455,
      "step": 370
    },
    {
      "epoch": 0.3669724770642202,
      "grad_norm": 1.3726165431422352,
      "learning_rate": 5e-06,
      "loss": 0.446,
      "step": 380
    },
    {
      "epoch": 0.3766296475132786,
      "grad_norm": 1.902972844618989,
      "learning_rate": 5e-06,
      "loss": 0.447,
      "step": 390
    },
    {
      "epoch": 0.38628681796233705,
      "grad_norm": 1.4905365903207408,
      "learning_rate": 5e-06,
      "loss": 0.4446,
      "step": 400
    },
    {
      "epoch": 0.39594398841139544,
      "grad_norm": 1.4193622471558158,
      "learning_rate": 5e-06,
      "loss": 0.4481,
      "step": 410
    },
    {
      "epoch": 0.4056011588604539,
      "grad_norm": 1.5124518593909522,
      "learning_rate": 5e-06,
      "loss": 0.4417,
      "step": 420
    },
    {
      "epoch": 0.4152583293095123,
      "grad_norm": 1.3927026205868633,
      "learning_rate": 5e-06,
      "loss": 0.4469,
      "step": 430
    },
    {
      "epoch": 0.42491549975857074,
      "grad_norm": 1.4506580301027323,
      "learning_rate": 5e-06,
      "loss": 0.442,
      "step": 440
    },
    {
      "epoch": 0.4345726702076292,
      "grad_norm": 1.3495438486710758,
      "learning_rate": 5e-06,
      "loss": 0.446,
      "step": 450
    },
    {
      "epoch": 0.4442298406566876,
      "grad_norm": 1.5206970544390066,
      "learning_rate": 5e-06,
      "loss": 0.4419,
      "step": 460
    },
    {
      "epoch": 0.45388701110574603,
      "grad_norm": 1.4858134923328608,
      "learning_rate": 5e-06,
      "loss": 0.4543,
      "step": 470
    },
    {
      "epoch": 0.4635441815548044,
      "grad_norm": 1.5043934135687507,
      "learning_rate": 5e-06,
      "loss": 0.4459,
      "step": 480
    },
    {
      "epoch": 0.4732013520038629,
      "grad_norm": 2.2482606991957463,
      "learning_rate": 5e-06,
      "loss": 0.4426,
      "step": 490
    },
    {
      "epoch": 0.48285852245292127,
      "grad_norm": 1.4045208076303264,
      "learning_rate": 5e-06,
      "loss": 0.4326,
      "step": 500
    },
    {
      "epoch": 0.4925156929019797,
      "grad_norm": 1.543363161753652,
      "learning_rate": 5e-06,
      "loss": 0.4439,
      "step": 510
    },
    {
      "epoch": 0.5021728633510382,
      "grad_norm": 1.3653299013378564,
      "learning_rate": 5e-06,
      "loss": 0.436,
      "step": 520
    },
    {
      "epoch": 0.5118300338000966,
      "grad_norm": 1.4150482204508787,
      "learning_rate": 5e-06,
      "loss": 0.4378,
      "step": 530
    },
    {
      "epoch": 0.521487204249155,
      "grad_norm": 1.3973876180346887,
      "learning_rate": 5e-06,
      "loss": 0.4395,
      "step": 540
    },
    {
      "epoch": 0.5311443746982134,
      "grad_norm": 1.4601941048467661,
      "learning_rate": 5e-06,
      "loss": 0.4409,
      "step": 550
    },
    {
      "epoch": 0.5408015451472719,
      "grad_norm": 1.367282105296493,
      "learning_rate": 5e-06,
      "loss": 0.4365,
      "step": 560
    },
    {
      "epoch": 0.5504587155963303,
      "grad_norm": 1.4142842428381743,
      "learning_rate": 5e-06,
      "loss": 0.4368,
      "step": 570
    },
    {
      "epoch": 0.5601158860453888,
      "grad_norm": 1.5238374949968105,
      "learning_rate": 5e-06,
      "loss": 0.4351,
      "step": 580
    },
    {
      "epoch": 0.5697730564944471,
      "grad_norm": 1.4426020649884985,
      "learning_rate": 5e-06,
      "loss": 0.4378,
      "step": 590
    },
    {
      "epoch": 0.5794302269435055,
      "grad_norm": 1.6034191206258377,
      "learning_rate": 5e-06,
      "loss": 0.4359,
      "step": 600
    },
    {
      "epoch": 0.589087397392564,
      "grad_norm": 1.3861019948416724,
      "learning_rate": 5e-06,
      "loss": 0.4366,
      "step": 610
    },
    {
      "epoch": 0.5987445678416224,
      "grad_norm": 1.5575620851838663,
      "learning_rate": 5e-06,
      "loss": 0.4293,
      "step": 620
    },
    {
      "epoch": 0.6084017382906808,
      "grad_norm": 1.4940232627496768,
      "learning_rate": 5e-06,
      "loss": 0.4308,
      "step": 630
    },
    {
      "epoch": 0.6180589087397392,
      "grad_norm": 1.3940687230128848,
      "learning_rate": 5e-06,
      "loss": 0.4344,
      "step": 640
    },
    {
      "epoch": 0.6277160791887977,
      "grad_norm": 1.4063585473842182,
      "learning_rate": 5e-06,
      "loss": 0.4312,
      "step": 650
    },
    {
      "epoch": 0.6373732496378561,
      "grad_norm": 1.4263057370102823,
      "learning_rate": 5e-06,
      "loss": 0.4288,
      "step": 660
    },
    {
      "epoch": 0.6470304200869146,
      "grad_norm": 1.359338730182779,
      "learning_rate": 5e-06,
      "loss": 0.432,
      "step": 670
    },
    {
      "epoch": 0.6566875905359729,
      "grad_norm": 1.4201113340211817,
      "learning_rate": 5e-06,
      "loss": 0.4271,
      "step": 680
    },
    {
      "epoch": 0.6663447609850314,
      "grad_norm": 1.3945845556957837,
      "learning_rate": 5e-06,
      "loss": 0.4297,
      "step": 690
    },
    {
      "epoch": 0.6760019314340898,
      "grad_norm": 1.4879889305917073,
      "learning_rate": 5e-06,
      "loss": 0.4275,
      "step": 700
    },
    {
      "epoch": 0.6856591018831483,
      "grad_norm": 1.5684295486014674,
      "learning_rate": 5e-06,
      "loss": 0.4318,
      "step": 710
    },
    {
      "epoch": 0.6953162723322066,
      "grad_norm": 1.6080556978629539,
      "learning_rate": 5e-06,
      "loss": 0.4327,
      "step": 720
    },
    {
      "epoch": 0.7049734427812651,
      "grad_norm": 1.4517662054077567,
      "learning_rate": 5e-06,
      "loss": 0.4316,
      "step": 730
    },
    {
      "epoch": 0.7146306132303235,
      "grad_norm": 1.2929610515185452,
      "learning_rate": 5e-06,
      "loss": 0.4294,
      "step": 740
    },
    {
      "epoch": 0.724287783679382,
      "grad_norm": 1.4669951814767501,
      "learning_rate": 5e-06,
      "loss": 0.4295,
      "step": 750
    },
    {
      "epoch": 0.7339449541284404,
      "grad_norm": 1.3523170842518906,
      "learning_rate": 5e-06,
      "loss": 0.4323,
      "step": 760
    },
    {
      "epoch": 0.7436021245774987,
      "grad_norm": 1.4848119403639475,
      "learning_rate": 5e-06,
      "loss": 0.4307,
      "step": 770
    },
    {
      "epoch": 0.7532592950265572,
      "grad_norm": 1.2865027479600895,
      "learning_rate": 5e-06,
      "loss": 0.4302,
      "step": 780
    },
    {
      "epoch": 0.7629164654756156,
      "grad_norm": 1.3394197026037133,
      "learning_rate": 5e-06,
      "loss": 0.4274,
      "step": 790
    },
    {
      "epoch": 0.7725736359246741,
      "grad_norm": 1.4058992660952825,
      "learning_rate": 5e-06,
      "loss": 0.4315,
      "step": 800
    },
    {
      "epoch": 0.7822308063737325,
      "grad_norm": 1.4021237990828046,
      "learning_rate": 5e-06,
      "loss": 0.4258,
      "step": 810
    },
    {
      "epoch": 0.7918879768227909,
      "grad_norm": 1.4089771533232405,
      "learning_rate": 5e-06,
      "loss": 0.4334,
      "step": 820
    },
    {
      "epoch": 0.8015451472718493,
      "grad_norm": 1.3600046812066533,
      "learning_rate": 5e-06,
      "loss": 0.4225,
      "step": 830
    },
    {
      "epoch": 0.8112023177209078,
      "grad_norm": 1.4685840247095097,
      "learning_rate": 5e-06,
      "loss": 0.4251,
      "step": 840
    },
    {
      "epoch": 0.8208594881699662,
      "grad_norm": 1.3854952044022746,
      "learning_rate": 5e-06,
      "loss": 0.4305,
      "step": 850
    },
    {
      "epoch": 0.8305166586190246,
      "grad_norm": 1.3373112968229082,
      "learning_rate": 5e-06,
      "loss": 0.4266,
      "step": 860
    },
    {
      "epoch": 0.840173829068083,
      "grad_norm": 1.4141453052618997,
      "learning_rate": 5e-06,
      "loss": 0.4276,
      "step": 870
    },
    {
      "epoch": 0.8498309995171415,
      "grad_norm": 1.5086378917784917,
      "learning_rate": 5e-06,
      "loss": 0.4247,
      "step": 880
    },
    {
      "epoch": 0.8594881699661999,
      "grad_norm": 7.458182848059056,
      "learning_rate": 5e-06,
      "loss": 0.4299,
      "step": 890
    },
    {
      "epoch": 0.8691453404152584,
      "grad_norm": 1.5061946368232628,
      "learning_rate": 5e-06,
      "loss": 0.4207,
      "step": 900
    },
    {
      "epoch": 0.8788025108643167,
      "grad_norm": 1.2888259560849804,
      "learning_rate": 5e-06,
      "loss": 0.4224,
      "step": 910
    },
    {
      "epoch": 0.8884596813133752,
      "grad_norm": 1.4278585748957382,
      "learning_rate": 5e-06,
      "loss": 0.4222,
      "step": 920
    },
    {
      "epoch": 0.8981168517624336,
      "grad_norm": 1.4321653215479198,
      "learning_rate": 5e-06,
      "loss": 0.4242,
      "step": 930
    },
    {
      "epoch": 0.9077740222114921,
      "grad_norm": 1.3716154862902383,
      "learning_rate": 5e-06,
      "loss": 0.4225,
      "step": 940
    },
    {
      "epoch": 0.9174311926605505,
      "grad_norm": 1.3700235242916874,
      "learning_rate": 5e-06,
      "loss": 0.421,
      "step": 950
    },
    {
      "epoch": 0.9270883631096088,
      "grad_norm": 1.3005175365880353,
      "learning_rate": 5e-06,
      "loss": 0.4191,
      "step": 960
    },
    {
      "epoch": 0.9367455335586673,
      "grad_norm": 1.3469399837188125,
      "learning_rate": 5e-06,
      "loss": 0.4215,
      "step": 970
    },
    {
      "epoch": 0.9464027040077257,
      "grad_norm": 1.3797432696350105,
      "learning_rate": 5e-06,
      "loss": 0.4265,
      "step": 980
    },
    {
      "epoch": 0.9560598744567842,
      "grad_norm": 1.2977225244391166,
      "learning_rate": 5e-06,
      "loss": 0.4215,
      "step": 990
    },
    {
      "epoch": 0.9657170449058425,
      "grad_norm": 1.2662429869365561,
      "learning_rate": 5e-06,
      "loss": 0.4227,
      "step": 1000
    },
    {
      "epoch": 0.975374215354901,
      "grad_norm": 1.3086127516497834,
      "learning_rate": 5e-06,
      "loss": 0.4178,
      "step": 1010
    },
    {
      "epoch": 0.9850313858039594,
      "grad_norm": 1.3519105272191883,
      "learning_rate": 5e-06,
      "loss": 0.4186,
      "step": 1020
    },
    {
      "epoch": 0.9946885562530179,
      "grad_norm": 1.301344809598359,
      "learning_rate": 5e-06,
      "loss": 0.417,
      "step": 1030
    },
    {
      "epoch": 0.9995171414775471,
      "eval_loss": 0.4187374413013458,
      "eval_runtime": 182.0762,
      "eval_samples_per_second": 153.249,
      "eval_steps_per_second": 0.599,
      "step": 1035
    },
    {
      "epoch": 1.0043457267020763,
      "grad_norm": 2.0938717237725024,
      "learning_rate": 5e-06,
      "loss": 0.3807,
      "step": 1040
    },
    {
      "epoch": 1.0140028971511348,
      "grad_norm": 1.6443261926343353,
      "learning_rate": 5e-06,
      "loss": 0.3165,
      "step": 1050
    },
    {
      "epoch": 1.0236600676001932,
      "grad_norm": 1.3639427703718043,
      "learning_rate": 5e-06,
      "loss": 0.3142,
      "step": 1060
    },
    {
      "epoch": 1.0333172380492515,
      "grad_norm": 1.4375989800345246,
      "learning_rate": 5e-06,
      "loss": 0.315,
      "step": 1070
    },
    {
      "epoch": 1.04297440849831,
      "grad_norm": 1.608018512856855,
      "learning_rate": 5e-06,
      "loss": 0.3105,
      "step": 1080
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 1.6910926571603153,
      "learning_rate": 5e-06,
      "loss": 0.3156,
      "step": 1090
    },
    {
      "epoch": 1.0622887493964268,
      "grad_norm": 1.501323197263165,
      "learning_rate": 5e-06,
      "loss": 0.3214,
      "step": 1100
    },
    {
      "epoch": 1.0719459198454853,
      "grad_norm": 1.5198024912737862,
      "learning_rate": 5e-06,
      "loss": 0.3187,
      "step": 1110
    },
    {
      "epoch": 1.0816030902945437,
      "grad_norm": 1.3967412194416047,
      "learning_rate": 5e-06,
      "loss": 0.3177,
      "step": 1120
    },
    {
      "epoch": 1.0912602607436022,
      "grad_norm": 1.5169020279572174,
      "learning_rate": 5e-06,
      "loss": 0.3194,
      "step": 1130
    },
    {
      "epoch": 1.1009174311926606,
      "grad_norm": 1.436092584626518,
      "learning_rate": 5e-06,
      "loss": 0.3184,
      "step": 1140
    },
    {
      "epoch": 1.110574601641719,
      "grad_norm": 1.5133593820810858,
      "learning_rate": 5e-06,
      "loss": 0.3209,
      "step": 1150
    },
    {
      "epoch": 1.1202317720907775,
      "grad_norm": 1.4358744153476208,
      "learning_rate": 5e-06,
      "loss": 0.3162,
      "step": 1160
    },
    {
      "epoch": 1.1298889425398357,
      "grad_norm": 1.550919605032498,
      "learning_rate": 5e-06,
      "loss": 0.3176,
      "step": 1170
    },
    {
      "epoch": 1.1395461129888942,
      "grad_norm": 1.4353404806567058,
      "learning_rate": 5e-06,
      "loss": 0.3206,
      "step": 1180
    },
    {
      "epoch": 1.1492032834379526,
      "grad_norm": 1.6213115971080414,
      "learning_rate": 5e-06,
      "loss": 0.3189,
      "step": 1190
    },
    {
      "epoch": 1.158860453887011,
      "grad_norm": 1.4735029754670448,
      "learning_rate": 5e-06,
      "loss": 0.318,
      "step": 1200
    },
    {
      "epoch": 1.1685176243360695,
      "grad_norm": 1.4914905722949483,
      "learning_rate": 5e-06,
      "loss": 0.3259,
      "step": 1210
    },
    {
      "epoch": 1.178174794785128,
      "grad_norm": 1.4734223688553245,
      "learning_rate": 5e-06,
      "loss": 0.3197,
      "step": 1220
    },
    {
      "epoch": 1.1878319652341864,
      "grad_norm": 1.4303986315832913,
      "learning_rate": 5e-06,
      "loss": 0.3152,
      "step": 1230
    },
    {
      "epoch": 1.1974891356832449,
      "grad_norm": 1.641460216526965,
      "learning_rate": 5e-06,
      "loss": 0.3203,
      "step": 1240
    },
    {
      "epoch": 1.2071463061323033,
      "grad_norm": 1.4854108071397198,
      "learning_rate": 5e-06,
      "loss": 0.3187,
      "step": 1250
    },
    {
      "epoch": 1.2168034765813616,
      "grad_norm": 1.5657183004768043,
      "learning_rate": 5e-06,
      "loss": 0.321,
      "step": 1260
    },
    {
      "epoch": 1.22646064703042,
      "grad_norm": 1.5392245759709497,
      "learning_rate": 5e-06,
      "loss": 0.3199,
      "step": 1270
    },
    {
      "epoch": 1.2361178174794785,
      "grad_norm": 1.4716569015272043,
      "learning_rate": 5e-06,
      "loss": 0.3167,
      "step": 1280
    },
    {
      "epoch": 1.245774987928537,
      "grad_norm": 1.4273719929805762,
      "learning_rate": 5e-06,
      "loss": 0.3203,
      "step": 1290
    },
    {
      "epoch": 1.2554321583775954,
      "grad_norm": 1.437096297299039,
      "learning_rate": 5e-06,
      "loss": 0.3211,
      "step": 1300
    },
    {
      "epoch": 1.2650893288266538,
      "grad_norm": 1.4122099703029305,
      "learning_rate": 5e-06,
      "loss": 0.3182,
      "step": 1310
    },
    {
      "epoch": 1.2747464992757123,
      "grad_norm": 1.62630879502805,
      "learning_rate": 5e-06,
      "loss": 0.3217,
      "step": 1320
    },
    {
      "epoch": 1.2844036697247707,
      "grad_norm": 1.516391222863939,
      "learning_rate": 5e-06,
      "loss": 0.319,
      "step": 1330
    },
    {
      "epoch": 1.2940608401738292,
      "grad_norm": 1.9043021829524946,
      "learning_rate": 5e-06,
      "loss": 0.3201,
      "step": 1340
    },
    {
      "epoch": 1.3037180106228874,
      "grad_norm": 1.468668927245736,
      "learning_rate": 5e-06,
      "loss": 0.3185,
      "step": 1350
    },
    {
      "epoch": 1.3133751810719458,
      "grad_norm": 1.5192911853076112,
      "learning_rate": 5e-06,
      "loss": 0.321,
      "step": 1360
    },
    {
      "epoch": 1.3230323515210043,
      "grad_norm": 1.4775016904206435,
      "learning_rate": 5e-06,
      "loss": 0.323,
      "step": 1370
    },
    {
      "epoch": 1.3326895219700627,
      "grad_norm": 1.5738282394014576,
      "learning_rate": 5e-06,
      "loss": 0.3244,
      "step": 1380
    },
    {
      "epoch": 1.3423466924191212,
      "grad_norm": 1.451975577099937,
      "learning_rate": 5e-06,
      "loss": 0.3231,
      "step": 1390
    },
    {
      "epoch": 1.3520038628681796,
      "grad_norm": 1.509638768808278,
      "learning_rate": 5e-06,
      "loss": 0.3238,
      "step": 1400
    },
    {
      "epoch": 1.361661033317238,
      "grad_norm": 1.4711514260625576,
      "learning_rate": 5e-06,
      "loss": 0.3246,
      "step": 1410
    },
    {
      "epoch": 1.3713182037662965,
      "grad_norm": 1.5617390029956357,
      "learning_rate": 5e-06,
      "loss": 0.3224,
      "step": 1420
    },
    {
      "epoch": 1.380975374215355,
      "grad_norm": 1.5052132010129833,
      "learning_rate": 5e-06,
      "loss": 0.3225,
      "step": 1430
    },
    {
      "epoch": 1.3906325446644132,
      "grad_norm": 1.5656411569392583,
      "learning_rate": 5e-06,
      "loss": 0.3151,
      "step": 1440
    },
    {
      "epoch": 1.4002897151134719,
      "grad_norm": 1.4546198678024953,
      "learning_rate": 5e-06,
      "loss": 0.3186,
      "step": 1450
    },
    {
      "epoch": 1.4099468855625301,
      "grad_norm": 1.4514681189539558,
      "learning_rate": 5e-06,
      "loss": 0.3252,
      "step": 1460
    },
    {
      "epoch": 1.4196040560115886,
      "grad_norm": 1.5187373109976763,
      "learning_rate": 5e-06,
      "loss": 0.3218,
      "step": 1470
    },
    {
      "epoch": 1.429261226460647,
      "grad_norm": 1.4206099886595174,
      "learning_rate": 5e-06,
      "loss": 0.3224,
      "step": 1480
    },
    {
      "epoch": 1.4389183969097055,
      "grad_norm": 1.3933469991291707,
      "learning_rate": 5e-06,
      "loss": 0.3261,
      "step": 1490
    },
    {
      "epoch": 1.448575567358764,
      "grad_norm": 1.5275153572741944,
      "learning_rate": 5e-06,
      "loss": 0.3196,
      "step": 1500
    },
    {
      "epoch": 1.4582327378078224,
      "grad_norm": 1.502387340983924,
      "learning_rate": 5e-06,
      "loss": 0.3202,
      "step": 1510
    },
    {
      "epoch": 1.4678899082568808,
      "grad_norm": 1.4367426017329248,
      "learning_rate": 5e-06,
      "loss": 0.3246,
      "step": 1520
    },
    {
      "epoch": 1.477547078705939,
      "grad_norm": 1.50109980877967,
      "learning_rate": 5e-06,
      "loss": 0.3232,
      "step": 1530
    },
    {
      "epoch": 1.4872042491549977,
      "grad_norm": 1.6172108349800483,
      "learning_rate": 5e-06,
      "loss": 0.3261,
      "step": 1540
    },
    {
      "epoch": 1.496861419604056,
      "grad_norm": 1.44998024463031,
      "learning_rate": 5e-06,
      "loss": 0.3223,
      "step": 1550
    },
    {
      "epoch": 1.5065185900531144,
      "grad_norm": 1.4396628484827616,
      "learning_rate": 5e-06,
      "loss": 0.3207,
      "step": 1560
    },
    {
      "epoch": 1.5161757605021728,
      "grad_norm": 1.4505846726645737,
      "learning_rate": 5e-06,
      "loss": 0.3238,
      "step": 1570
    },
    {
      "epoch": 1.5258329309512313,
      "grad_norm": 1.3991156805400873,
      "learning_rate": 5e-06,
      "loss": 0.3205,
      "step": 1580
    },
    {
      "epoch": 1.5354901014002897,
      "grad_norm": 1.8677762783077632,
      "learning_rate": 5e-06,
      "loss": 0.3258,
      "step": 1590
    },
    {
      "epoch": 1.5451472718493482,
      "grad_norm": 1.5966624238860196,
      "learning_rate": 5e-06,
      "loss": 0.3253,
      "step": 1600
    },
    {
      "epoch": 1.5548044422984066,
      "grad_norm": 1.4666709592985163,
      "learning_rate": 5e-06,
      "loss": 0.3197,
      "step": 1610
    },
    {
      "epoch": 1.5644616127474649,
      "grad_norm": 1.5429686092632175,
      "learning_rate": 5e-06,
      "loss": 0.3243,
      "step": 1620
    },
    {
      "epoch": 1.5741187831965235,
      "grad_norm": 1.6197267190074782,
      "learning_rate": 5e-06,
      "loss": 0.3194,
      "step": 1630
    },
    {
      "epoch": 1.5837759536455818,
      "grad_norm": 1.5284451091184315,
      "learning_rate": 5e-06,
      "loss": 0.3195,
      "step": 1640
    },
    {
      "epoch": 1.5934331240946402,
      "grad_norm": 1.4818015371419853,
      "learning_rate": 5e-06,
      "loss": 0.3252,
      "step": 1650
    },
    {
      "epoch": 1.6030902945436987,
      "grad_norm": 1.5230446062018719,
      "learning_rate": 5e-06,
      "loss": 0.3256,
      "step": 1660
    },
    {
      "epoch": 1.6127474649927571,
      "grad_norm": 1.3984458497949286,
      "learning_rate": 5e-06,
      "loss": 0.3251,
      "step": 1670
    },
    {
      "epoch": 1.6224046354418156,
      "grad_norm": 1.3753762689022815,
      "learning_rate": 5e-06,
      "loss": 0.318,
      "step": 1680
    },
    {
      "epoch": 1.632061805890874,
      "grad_norm": 1.4229134627752649,
      "learning_rate": 5e-06,
      "loss": 0.3206,
      "step": 1690
    },
    {
      "epoch": 1.6417189763399325,
      "grad_norm": 1.6325248655032423,
      "learning_rate": 5e-06,
      "loss": 0.3254,
      "step": 1700
    },
    {
      "epoch": 1.6513761467889907,
      "grad_norm": 1.460452232456223,
      "learning_rate": 5e-06,
      "loss": 0.3256,
      "step": 1710
    },
    {
      "epoch": 1.6610333172380494,
      "grad_norm": 1.4967322575333943,
      "learning_rate": 5e-06,
      "loss": 0.3221,
      "step": 1720
    },
    {
      "epoch": 1.6706904876871076,
      "grad_norm": 1.4589595336063315,
      "learning_rate": 5e-06,
      "loss": 0.3247,
      "step": 1730
    },
    {
      "epoch": 1.6803476581361663,
      "grad_norm": 1.535858835340216,
      "learning_rate": 5e-06,
      "loss": 0.3218,
      "step": 1740
    },
    {
      "epoch": 1.6900048285852245,
      "grad_norm": 1.5876620484010096,
      "learning_rate": 5e-06,
      "loss": 0.3232,
      "step": 1750
    },
    {
      "epoch": 1.699661999034283,
      "grad_norm": 1.4347951889442994,
      "learning_rate": 5e-06,
      "loss": 0.3223,
      "step": 1760
    },
    {
      "epoch": 1.7093191694833414,
      "grad_norm": 1.462429783459939,
      "learning_rate": 5e-06,
      "loss": 0.321,
      "step": 1770
    },
    {
      "epoch": 1.7189763399323998,
      "grad_norm": 1.4922559510321622,
      "learning_rate": 5e-06,
      "loss": 0.3231,
      "step": 1780
    },
    {
      "epoch": 1.7286335103814583,
      "grad_norm": 1.467959135285837,
      "learning_rate": 5e-06,
      "loss": 0.328,
      "step": 1790
    },
    {
      "epoch": 1.7382906808305165,
      "grad_norm": 1.49130361205012,
      "learning_rate": 5e-06,
      "loss": 0.3266,
      "step": 1800
    },
    {
      "epoch": 1.7479478512795752,
      "grad_norm": 2.106032317323986,
      "learning_rate": 5e-06,
      "loss": 0.3251,
      "step": 1810
    },
    {
      "epoch": 1.7576050217286334,
      "grad_norm": 1.543093994781833,
      "learning_rate": 5e-06,
      "loss": 0.3243,
      "step": 1820
    },
    {
      "epoch": 1.767262192177692,
      "grad_norm": 1.4767806762226672,
      "learning_rate": 5e-06,
      "loss": 0.3222,
      "step": 1830
    },
    {
      "epoch": 1.7769193626267503,
      "grad_norm": 1.6074675928640145,
      "learning_rate": 5e-06,
      "loss": 0.3243,
      "step": 1840
    },
    {
      "epoch": 1.7865765330758088,
      "grad_norm": 1.586893521300227,
      "learning_rate": 5e-06,
      "loss": 0.3271,
      "step": 1850
    },
    {
      "epoch": 1.7962337035248672,
      "grad_norm": 1.5199781278405553,
      "learning_rate": 5e-06,
      "loss": 0.3226,
      "step": 1860
    },
    {
      "epoch": 1.8058908739739257,
      "grad_norm": 1.7072009333034235,
      "learning_rate": 5e-06,
      "loss": 0.3251,
      "step": 1870
    },
    {
      "epoch": 1.8155480444229841,
      "grad_norm": 1.4657197536244577,
      "learning_rate": 5e-06,
      "loss": 0.3232,
      "step": 1880
    },
    {
      "epoch": 1.8252052148720423,
      "grad_norm": 1.5502462503355572,
      "learning_rate": 5e-06,
      "loss": 0.3241,
      "step": 1890
    },
    {
      "epoch": 1.834862385321101,
      "grad_norm": 1.4268551559294822,
      "learning_rate": 5e-06,
      "loss": 0.3233,
      "step": 1900
    },
    {
      "epoch": 1.8445195557701592,
      "grad_norm": 1.5487349032646411,
      "learning_rate": 5e-06,
      "loss": 0.3248,
      "step": 1910
    },
    {
      "epoch": 1.854176726219218,
      "grad_norm": 1.5587365739097951,
      "learning_rate": 5e-06,
      "loss": 0.3262,
      "step": 1920
    },
    {
      "epoch": 1.8638338966682761,
      "grad_norm": 1.6410211222118756,
      "learning_rate": 5e-06,
      "loss": 0.3243,
      "step": 1930
    },
    {
      "epoch": 1.8734910671173346,
      "grad_norm": 1.4412258241860316,
      "learning_rate": 5e-06,
      "loss": 0.3242,
      "step": 1940
    },
    {
      "epoch": 1.883148237566393,
      "grad_norm": 1.408887524672963,
      "learning_rate": 5e-06,
      "loss": 0.3207,
      "step": 1950
    },
    {
      "epoch": 1.8928054080154515,
      "grad_norm": 1.709744732252996,
      "learning_rate": 5e-06,
      "loss": 0.3192,
      "step": 1960
    },
    {
      "epoch": 1.90246257846451,
      "grad_norm": 1.53624861095146,
      "learning_rate": 5e-06,
      "loss": 0.3234,
      "step": 1970
    },
    {
      "epoch": 1.9121197489135682,
      "grad_norm": 1.4790016911744388,
      "learning_rate": 5e-06,
      "loss": 0.3239,
      "step": 1980
    },
    {
      "epoch": 1.9217769193626268,
      "grad_norm": 1.4505596932174116,
      "learning_rate": 5e-06,
      "loss": 0.3226,
      "step": 1990
    },
    {
      "epoch": 1.931434089811685,
      "grad_norm": 1.3770134690086049,
      "learning_rate": 5e-06,
      "loss": 0.3253,
      "step": 2000
    },
    {
      "epoch": 1.9410912602607437,
      "grad_norm": 1.6550530625579105,
      "learning_rate": 5e-06,
      "loss": 0.3255,
      "step": 2010
    },
    {
      "epoch": 1.950748430709802,
      "grad_norm": 1.5935808137814957,
      "learning_rate": 5e-06,
      "loss": 0.3236,
      "step": 2020
    },
    {
      "epoch": 1.9604056011588604,
      "grad_norm": 1.5070382250631085,
      "learning_rate": 5e-06,
      "loss": 0.3314,
      "step": 2030
    },
    {
      "epoch": 1.9700627716079189,
      "grad_norm": 1.3793762943653258,
      "learning_rate": 5e-06,
      "loss": 0.3231,
      "step": 2040
    },
    {
      "epoch": 1.9797199420569773,
      "grad_norm": 1.4658674183812177,
      "learning_rate": 5e-06,
      "loss": 0.3247,
      "step": 2050
    },
    {
      "epoch": 1.9893771125060358,
      "grad_norm": 1.5804425363669086,
      "learning_rate": 5e-06,
      "loss": 0.3286,
      "step": 2060
    },
    {
      "epoch": 1.999034282955094,
      "grad_norm": 1.550360674186604,
      "learning_rate": 5e-06,
      "loss": 0.3265,
      "step": 2070
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.4136686325073242,
      "eval_runtime": 178.6997,
      "eval_samples_per_second": 156.145,
      "eval_steps_per_second": 0.61,
      "step": 2071
    },
    {
      "epoch": 2.0086914534041527,
      "grad_norm": 1.9820970989127087,
      "learning_rate": 5e-06,
      "loss": 0.2226,
      "step": 2080
    },
    {
      "epoch": 2.018348623853211,
      "grad_norm": 1.746604108675904,
      "learning_rate": 5e-06,
      "loss": 0.2022,
      "step": 2090
    },
    {
      "epoch": 2.0280057943022696,
      "grad_norm": 1.738922693622109,
      "learning_rate": 5e-06,
      "loss": 0.2034,
      "step": 2100
    },
    {
      "epoch": 2.037662964751328,
      "grad_norm": 1.842908561465572,
      "learning_rate": 5e-06,
      "loss": 0.1987,
      "step": 2110
    },
    {
      "epoch": 2.0473201352003865,
      "grad_norm": 1.555963693945732,
      "learning_rate": 5e-06,
      "loss": 0.1998,
      "step": 2120
    },
    {
      "epoch": 2.0569773056494447,
      "grad_norm": 1.6105373644152785,
      "learning_rate": 5e-06,
      "loss": 0.1977,
      "step": 2130
    },
    {
      "epoch": 2.066634476098503,
      "grad_norm": 1.7277918775624639,
      "learning_rate": 5e-06,
      "loss": 0.1997,
      "step": 2140
    },
    {
      "epoch": 2.0762916465475616,
      "grad_norm": 1.7710659996843952,
      "learning_rate": 5e-06,
      "loss": 0.2002,
      "step": 2150
    },
    {
      "epoch": 2.08594881699662,
      "grad_norm": 1.6510171974053387,
      "learning_rate": 5e-06,
      "loss": 0.2025,
      "step": 2160
    },
    {
      "epoch": 2.0956059874456785,
      "grad_norm": 1.7020580751172847,
      "learning_rate": 5e-06,
      "loss": 0.2005,
      "step": 2170
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 2.0101369980825474,
      "learning_rate": 5e-06,
      "loss": 0.2076,
      "step": 2180
    },
    {
      "epoch": 2.1149203283437954,
      "grad_norm": 1.6487612820560622,
      "learning_rate": 5e-06,
      "loss": 0.2031,
      "step": 2190
    },
    {
      "epoch": 2.1245774987928536,
      "grad_norm": 1.6667015280913091,
      "learning_rate": 5e-06,
      "loss": 0.2042,
      "step": 2200
    },
    {
      "epoch": 2.1342346692419123,
      "grad_norm": 1.6702800446638546,
      "learning_rate": 5e-06,
      "loss": 0.2016,
      "step": 2210
    },
    {
      "epoch": 2.1438918396909705,
      "grad_norm": 1.5921726575950466,
      "learning_rate": 5e-06,
      "loss": 0.2017,
      "step": 2220
    },
    {
      "epoch": 2.153549010140029,
      "grad_norm": 1.6461690389920633,
      "learning_rate": 5e-06,
      "loss": 0.2015,
      "step": 2230
    },
    {
      "epoch": 2.1632061805890874,
      "grad_norm": 1.7782309168905173,
      "learning_rate": 5e-06,
      "loss": 0.2012,
      "step": 2240
    },
    {
      "epoch": 2.1728633510381457,
      "grad_norm": 1.768405330134614,
      "learning_rate": 5e-06,
      "loss": 0.2058,
      "step": 2250
    },
    {
      "epoch": 2.1825205214872043,
      "grad_norm": 1.8294577091973667,
      "learning_rate": 5e-06,
      "loss": 0.2039,
      "step": 2260
    },
    {
      "epoch": 2.1921776919362626,
      "grad_norm": 1.6757794760632077,
      "learning_rate": 5e-06,
      "loss": 0.206,
      "step": 2270
    },
    {
      "epoch": 2.2018348623853212,
      "grad_norm": 1.674322596519805,
      "learning_rate": 5e-06,
      "loss": 0.2073,
      "step": 2280
    },
    {
      "epoch": 2.2114920328343795,
      "grad_norm": 1.729559352602187,
      "learning_rate": 5e-06,
      "loss": 0.2082,
      "step": 2290
    },
    {
      "epoch": 2.221149203283438,
      "grad_norm": 1.7521877519603215,
      "learning_rate": 5e-06,
      "loss": 0.206,
      "step": 2300
    },
    {
      "epoch": 2.2308063737324964,
      "grad_norm": 1.6806851822041886,
      "learning_rate": 5e-06,
      "loss": 0.2022,
      "step": 2310
    },
    {
      "epoch": 2.240463544181555,
      "grad_norm": 1.7855843714445576,
      "learning_rate": 5e-06,
      "loss": 0.2092,
      "step": 2320
    },
    {
      "epoch": 2.2501207146306133,
      "grad_norm": 1.6890806520431167,
      "learning_rate": 5e-06,
      "loss": 0.2089,
      "step": 2330
    },
    {
      "epoch": 2.2597778850796715,
      "grad_norm": 1.6876559538674738,
      "learning_rate": 5e-06,
      "loss": 0.2076,
      "step": 2340
    },
    {
      "epoch": 2.26943505552873,
      "grad_norm": 1.866952527448764,
      "learning_rate": 5e-06,
      "loss": 0.2101,
      "step": 2350
    },
    {
      "epoch": 2.2790922259777884,
      "grad_norm": 1.7778130347386727,
      "learning_rate": 5e-06,
      "loss": 0.2092,
      "step": 2360
    },
    {
      "epoch": 2.288749396426847,
      "grad_norm": 1.7573632813881142,
      "learning_rate": 5e-06,
      "loss": 0.2092,
      "step": 2370
    },
    {
      "epoch": 2.2984065668759053,
      "grad_norm": 1.670588490011592,
      "learning_rate": 5e-06,
      "loss": 0.21,
      "step": 2380
    },
    {
      "epoch": 2.308063737324964,
      "grad_norm": 2.0244107541981857,
      "learning_rate": 5e-06,
      "loss": 0.2121,
      "step": 2390
    },
    {
      "epoch": 2.317720907774022,
      "grad_norm": 1.7228769196960538,
      "learning_rate": 5e-06,
      "loss": 0.2065,
      "step": 2400
    },
    {
      "epoch": 2.327378078223081,
      "grad_norm": 1.7316110503319007,
      "learning_rate": 5e-06,
      "loss": 0.2079,
      "step": 2410
    },
    {
      "epoch": 2.337035248672139,
      "grad_norm": 1.7859669994070049,
      "learning_rate": 5e-06,
      "loss": 0.2106,
      "step": 2420
    },
    {
      "epoch": 2.3466924191211973,
      "grad_norm": 1.6790885546872112,
      "learning_rate": 5e-06,
      "loss": 0.2134,
      "step": 2430
    },
    {
      "epoch": 2.356349589570256,
      "grad_norm": 1.658832098468064,
      "learning_rate": 5e-06,
      "loss": 0.209,
      "step": 2440
    },
    {
      "epoch": 2.366006760019314,
      "grad_norm": 1.737362445041373,
      "learning_rate": 5e-06,
      "loss": 0.2086,
      "step": 2450
    },
    {
      "epoch": 2.375663930468373,
      "grad_norm": 1.627103813908181,
      "learning_rate": 5e-06,
      "loss": 0.214,
      "step": 2460
    },
    {
      "epoch": 2.385321100917431,
      "grad_norm": 1.7557722068636354,
      "learning_rate": 5e-06,
      "loss": 0.2119,
      "step": 2470
    },
    {
      "epoch": 2.3949782713664898,
      "grad_norm": 1.676762758309891,
      "learning_rate": 5e-06,
      "loss": 0.2143,
      "step": 2480
    },
    {
      "epoch": 2.404635441815548,
      "grad_norm": 1.8055582761883138,
      "learning_rate": 5e-06,
      "loss": 0.2138,
      "step": 2490
    },
    {
      "epoch": 2.4142926122646067,
      "grad_norm": 1.8126267211804017,
      "learning_rate": 5e-06,
      "loss": 0.213,
      "step": 2500
    },
    {
      "epoch": 2.423949782713665,
      "grad_norm": 1.737365446135157,
      "learning_rate": 5e-06,
      "loss": 0.2101,
      "step": 2510
    },
    {
      "epoch": 2.433606953162723,
      "grad_norm": 1.8601108967097126,
      "learning_rate": 5e-06,
      "loss": 0.2113,
      "step": 2520
    },
    {
      "epoch": 2.443264123611782,
      "grad_norm": 1.7975172388473801,
      "learning_rate": 5e-06,
      "loss": 0.2121,
      "step": 2530
    },
    {
      "epoch": 2.45292129406084,
      "grad_norm": 1.6548943065520165,
      "learning_rate": 5e-06,
      "loss": 0.213,
      "step": 2540
    },
    {
      "epoch": 2.4625784645098987,
      "grad_norm": 1.6142344009099507,
      "learning_rate": 5e-06,
      "loss": 0.2101,
      "step": 2550
    },
    {
      "epoch": 2.472235634958957,
      "grad_norm": 1.7136093491402586,
      "learning_rate": 5e-06,
      "loss": 0.2132,
      "step": 2560
    },
    {
      "epoch": 2.4818928054080156,
      "grad_norm": 1.6928112777667244,
      "learning_rate": 5e-06,
      "loss": 0.214,
      "step": 2570
    },
    {
      "epoch": 2.491549975857074,
      "grad_norm": 1.7487627188812938,
      "learning_rate": 5e-06,
      "loss": 0.2121,
      "step": 2580
    },
    {
      "epoch": 2.5012071463061325,
      "grad_norm": 1.7611973606308302,
      "learning_rate": 5e-06,
      "loss": 0.2146,
      "step": 2590
    },
    {
      "epoch": 2.5108643167551907,
      "grad_norm": 1.7432835398393358,
      "learning_rate": 5e-06,
      "loss": 0.2131,
      "step": 2600
    },
    {
      "epoch": 2.520521487204249,
      "grad_norm": 1.914665414646886,
      "learning_rate": 5e-06,
      "loss": 0.2135,
      "step": 2610
    },
    {
      "epoch": 2.5301786576533076,
      "grad_norm": 1.7235392993581606,
      "learning_rate": 5e-06,
      "loss": 0.2139,
      "step": 2620
    },
    {
      "epoch": 2.539835828102366,
      "grad_norm": 1.8401761899916762,
      "learning_rate": 5e-06,
      "loss": 0.216,
      "step": 2630
    },
    {
      "epoch": 2.5494929985514245,
      "grad_norm": 1.7238390715314391,
      "learning_rate": 5e-06,
      "loss": 0.2136,
      "step": 2640
    },
    {
      "epoch": 2.5591501690004828,
      "grad_norm": 1.6967163269012748,
      "learning_rate": 5e-06,
      "loss": 0.2135,
      "step": 2650
    },
    {
      "epoch": 2.5688073394495414,
      "grad_norm": 1.6547200950158283,
      "learning_rate": 5e-06,
      "loss": 0.2154,
      "step": 2660
    },
    {
      "epoch": 2.5784645098985997,
      "grad_norm": 1.7345144961212824,
      "learning_rate": 5e-06,
      "loss": 0.2194,
      "step": 2670
    },
    {
      "epoch": 2.5881216803476583,
      "grad_norm": 1.695025030426855,
      "learning_rate": 5e-06,
      "loss": 0.2148,
      "step": 2680
    },
    {
      "epoch": 2.5977788507967166,
      "grad_norm": 1.6586909849777336,
      "learning_rate": 5e-06,
      "loss": 0.2177,
      "step": 2690
    },
    {
      "epoch": 2.607436021245775,
      "grad_norm": 1.6450495177327662,
      "learning_rate": 5e-06,
      "loss": 0.217,
      "step": 2700
    },
    {
      "epoch": 2.6170931916948335,
      "grad_norm": 1.75951484894314,
      "learning_rate": 5e-06,
      "loss": 0.2161,
      "step": 2710
    },
    {
      "epoch": 2.6267503621438917,
      "grad_norm": 1.6771766744486507,
      "learning_rate": 5e-06,
      "loss": 0.216,
      "step": 2720
    },
    {
      "epoch": 2.6364075325929504,
      "grad_norm": 1.6522749567618924,
      "learning_rate": 5e-06,
      "loss": 0.2164,
      "step": 2730
    },
    {
      "epoch": 2.6460647030420086,
      "grad_norm": 1.6904676125685896,
      "learning_rate": 5e-06,
      "loss": 0.2183,
      "step": 2740
    },
    {
      "epoch": 2.6557218734910673,
      "grad_norm": 1.9074890209165016,
      "learning_rate": 5e-06,
      "loss": 0.2185,
      "step": 2750
    },
    {
      "epoch": 2.6653790439401255,
      "grad_norm": 1.7817515462190572,
      "learning_rate": 5e-06,
      "loss": 0.2166,
      "step": 2760
    },
    {
      "epoch": 2.675036214389184,
      "grad_norm": 1.7562563485880531,
      "learning_rate": 5e-06,
      "loss": 0.2184,
      "step": 2770
    },
    {
      "epoch": 2.6846933848382424,
      "grad_norm": 1.998249733627222,
      "learning_rate": 5e-06,
      "loss": 0.2207,
      "step": 2780
    },
    {
      "epoch": 2.6943505552873006,
      "grad_norm": 1.7695683095416619,
      "learning_rate": 5e-06,
      "loss": 0.2154,
      "step": 2790
    },
    {
      "epoch": 2.7040077257363593,
      "grad_norm": 1.8078600319718185,
      "learning_rate": 5e-06,
      "loss": 0.2194,
      "step": 2800
    },
    {
      "epoch": 2.7136648961854175,
      "grad_norm": 1.8027624003524778,
      "learning_rate": 5e-06,
      "loss": 0.216,
      "step": 2810
    },
    {
      "epoch": 2.723322066634476,
      "grad_norm": 1.6742700740217098,
      "learning_rate": 5e-06,
      "loss": 0.2208,
      "step": 2820
    },
    {
      "epoch": 2.7329792370835344,
      "grad_norm": 1.773334112408393,
      "learning_rate": 5e-06,
      "loss": 0.2188,
      "step": 2830
    },
    {
      "epoch": 2.742636407532593,
      "grad_norm": 1.6731213070854867,
      "learning_rate": 5e-06,
      "loss": 0.222,
      "step": 2840
    },
    {
      "epoch": 2.7522935779816513,
      "grad_norm": 1.7120912589030635,
      "learning_rate": 5e-06,
      "loss": 0.2174,
      "step": 2850
    },
    {
      "epoch": 2.76195074843071,
      "grad_norm": 1.7123222485174125,
      "learning_rate": 5e-06,
      "loss": 0.2177,
      "step": 2860
    },
    {
      "epoch": 2.771607918879768,
      "grad_norm": 1.6545568282400362,
      "learning_rate": 5e-06,
      "loss": 0.2176,
      "step": 2870
    },
    {
      "epoch": 2.7812650893288264,
      "grad_norm": 2.0273088669056385,
      "learning_rate": 5e-06,
      "loss": 0.2177,
      "step": 2880
    },
    {
      "epoch": 2.790922259777885,
      "grad_norm": 1.6066656179070626,
      "learning_rate": 5e-06,
      "loss": 0.219,
      "step": 2890
    },
    {
      "epoch": 2.8005794302269438,
      "grad_norm": 1.8053174623133685,
      "learning_rate": 5e-06,
      "loss": 0.2176,
      "step": 2900
    },
    {
      "epoch": 2.810236600676002,
      "grad_norm": 1.70700256545585,
      "learning_rate": 5e-06,
      "loss": 0.2196,
      "step": 2910
    },
    {
      "epoch": 2.8198937711250602,
      "grad_norm": 1.8175496500932709,
      "learning_rate": 5e-06,
      "loss": 0.2207,
      "step": 2920
    },
    {
      "epoch": 2.829550941574119,
      "grad_norm": 1.7332734928998839,
      "learning_rate": 5e-06,
      "loss": 0.2192,
      "step": 2930
    },
    {
      "epoch": 2.839208112023177,
      "grad_norm": 1.6737893973481872,
      "learning_rate": 5e-06,
      "loss": 0.2198,
      "step": 2940
    },
    {
      "epoch": 2.848865282472236,
      "grad_norm": 1.861536272319189,
      "learning_rate": 5e-06,
      "loss": 0.2211,
      "step": 2950
    },
    {
      "epoch": 2.858522452921294,
      "grad_norm": 1.7745633627018984,
      "learning_rate": 5e-06,
      "loss": 0.2242,
      "step": 2960
    },
    {
      "epoch": 2.8681796233703523,
      "grad_norm": 1.7269651603359077,
      "learning_rate": 5e-06,
      "loss": 0.221,
      "step": 2970
    },
    {
      "epoch": 2.877836793819411,
      "grad_norm": 1.8445740625163256,
      "learning_rate": 5e-06,
      "loss": 0.2205,
      "step": 2980
    },
    {
      "epoch": 2.8874939642684696,
      "grad_norm": 1.7256657735965188,
      "learning_rate": 5e-06,
      "loss": 0.2187,
      "step": 2990
    },
    {
      "epoch": 2.897151134717528,
      "grad_norm": 1.7947034947358826,
      "learning_rate": 5e-06,
      "loss": 0.2217,
      "step": 3000
    },
    {
      "epoch": 2.906808305166586,
      "grad_norm": 1.7396418325368443,
      "learning_rate": 5e-06,
      "loss": 0.2183,
      "step": 3010
    },
    {
      "epoch": 2.9164654756156447,
      "grad_norm": 1.6566041516835608,
      "learning_rate": 5e-06,
      "loss": 0.224,
      "step": 3020
    },
    {
      "epoch": 2.926122646064703,
      "grad_norm": 1.7285392977136116,
      "learning_rate": 5e-06,
      "loss": 0.2209,
      "step": 3030
    },
    {
      "epoch": 2.9357798165137616,
      "grad_norm": 1.6972125749702298,
      "learning_rate": 5e-06,
      "loss": 0.2222,
      "step": 3040
    },
    {
      "epoch": 2.94543698696282,
      "grad_norm": 1.7429187695624144,
      "learning_rate": 5e-06,
      "loss": 0.2197,
      "step": 3050
    },
    {
      "epoch": 2.955094157411878,
      "grad_norm": 1.7068313963220236,
      "learning_rate": 5e-06,
      "loss": 0.2219,
      "step": 3060
    },
    {
      "epoch": 2.9647513278609368,
      "grad_norm": 1.6955378543746138,
      "learning_rate": 5e-06,
      "loss": 0.2212,
      "step": 3070
    },
    {
      "epoch": 2.9744084983099954,
      "grad_norm": 1.685143060439258,
      "learning_rate": 5e-06,
      "loss": 0.2233,
      "step": 3080
    },
    {
      "epoch": 2.9840656687590537,
      "grad_norm": 1.7308667414857535,
      "learning_rate": 5e-06,
      "loss": 0.2226,
      "step": 3090
    },
    {
      "epoch": 2.993722839208112,
      "grad_norm": 1.7145293308716394,
      "learning_rate": 5e-06,
      "loss": 0.2217,
      "step": 3100
    },
    {
      "epoch": 2.998551424432641,
      "eval_loss": 0.44667962193489075,
      "eval_runtime": 176.8031,
      "eval_samples_per_second": 157.82,
      "eval_steps_per_second": 0.617,
      "step": 3105
    },
    {
      "epoch": 2.998551424432641,
      "step": 3105,
      "total_flos": 5200153128468480.0,
      "train_loss": 0.3276555372321087,
      "train_runtime": 26364.0416,
      "train_samples_per_second": 60.326,
      "train_steps_per_second": 0.118
    }
  ],
  "logging_steps": 10,
  "max_steps": 3105,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5200153128468480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}