|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9980976537729864, |
|
"eval_steps": 500, |
|
"global_step": 1182, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.025364616360177554, |
|
"grad_norm": 2.4958535272810582, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7847, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05072923272035511, |
|
"grad_norm": 1.332301089693953, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6565, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07609384908053266, |
|
"grad_norm": 0.8570758334622178, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6087, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10145846544071022, |
|
"grad_norm": 0.7961220667217911, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5923, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12682308180088775, |
|
"grad_norm": 1.0397565268140934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5693, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1521876981610653, |
|
"grad_norm": 0.8562621904197287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5503, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17755231452124287, |
|
"grad_norm": 0.8886037080258321, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5498, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20291693088142043, |
|
"grad_norm": 0.6838633568199429, |
|
"learning_rate": 5e-06, |
|
"loss": 0.534, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22828154724159797, |
|
"grad_norm": 0.6556349806544307, |
|
"learning_rate": 5e-06, |
|
"loss": 0.535, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2536461636017755, |
|
"grad_norm": 0.7469146203238889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5333, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.27901077996195306, |
|
"grad_norm": 0.5122739271343442, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5172, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3043753963221306, |
|
"grad_norm": 0.9622908612260581, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5257, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3297400126823082, |
|
"grad_norm": 0.6046903397133303, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5151, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.35510462904248574, |
|
"grad_norm": 0.47575500456135494, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5143, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3804692454026633, |
|
"grad_norm": 0.7013205926571314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5144, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.40583386176284086, |
|
"grad_norm": 0.6351422540630048, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5058, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.43119847812301837, |
|
"grad_norm": 0.6261693017885483, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5102, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.45656309448319593, |
|
"grad_norm": 0.7605740230985341, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5078, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 0.5845861272533613, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5043, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.507292327203551, |
|
"grad_norm": 0.7171099558889358, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5053, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5326569435637286, |
|
"grad_norm": 0.5158539718235372, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5066, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5580215599239061, |
|
"grad_norm": 0.7716179700630799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5037, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5833861762840837, |
|
"grad_norm": 0.5356639716385265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5077, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6087507926442612, |
|
"grad_norm": 0.5909560901543055, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4978, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6341154090044389, |
|
"grad_norm": 0.5612449176342577, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4955, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6594800253646164, |
|
"grad_norm": 0.8220158158926282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4932, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6848446417247939, |
|
"grad_norm": 0.9803427711154427, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4935, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7102092580849715, |
|
"grad_norm": 0.7003489682973207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4921, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.735573874445149, |
|
"grad_norm": 0.7155818668831541, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4941, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7609384908053266, |
|
"grad_norm": 0.5641884255018443, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4905, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7863031071655041, |
|
"grad_norm": 0.5667685684791592, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4972, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8116677235256817, |
|
"grad_norm": 0.5424782856163526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4908, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8370323398858592, |
|
"grad_norm": 0.555119069867457, |
|
"learning_rate": 5e-06, |
|
"loss": 0.49, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8623969562460367, |
|
"grad_norm": 0.5540403091132209, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4892, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8877615726062144, |
|
"grad_norm": 0.6718528259146384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4879, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9131261889663919, |
|
"grad_norm": 0.48504592421103015, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4866, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9384908053265695, |
|
"grad_norm": 0.5794400662308987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.489, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.49175786205010735, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4792, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9892200380469245, |
|
"grad_norm": 0.48088824717550854, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4793, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9993658845909955, |
|
"eval_loss": 0.48458319902420044, |
|
"eval_runtime": 140.5522, |
|
"eval_samples_per_second": 75.552, |
|
"eval_steps_per_second": 0.591, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.014584654407102, |
|
"grad_norm": 0.6241434976553506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4668, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0399492707672797, |
|
"grad_norm": 0.5387091155966651, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4467, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0653138871274572, |
|
"grad_norm": 0.6088667420403366, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4552, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0906785034876347, |
|
"grad_norm": 0.7635188991702534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4569, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1160431198478122, |
|
"grad_norm": 0.5202613636726365, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4532, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1414077362079897, |
|
"grad_norm": 0.5431289298627378, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4552, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1667723525681675, |
|
"grad_norm": 0.5447516747773636, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4517, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.192136968928345, |
|
"grad_norm": 0.5811733767557097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4596, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.2175015852885225, |
|
"grad_norm": 0.5291374404256166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4523, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2428662016487, |
|
"grad_norm": 0.920406850160634, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4512, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.2682308180088775, |
|
"grad_norm": 0.5379277068224477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4589, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2935954343690552, |
|
"grad_norm": 0.6084288782824112, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4476, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.3189600507292327, |
|
"grad_norm": 0.6373203390142074, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4508, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3443246670894102, |
|
"grad_norm": 0.5297816500484004, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4519, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.369689283449588, |
|
"grad_norm": 0.5214550304276996, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4507, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3950538998097652, |
|
"grad_norm": 0.5932937282969459, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4508, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.420418516169943, |
|
"grad_norm": 0.5015573262400715, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4529, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.4457831325301205, |
|
"grad_norm": 0.6541003393290922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4487, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.471147748890298, |
|
"grad_norm": 0.4738510019221813, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4437, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.4965123652504757, |
|
"grad_norm": 0.5284328908203406, |
|
"learning_rate": 5e-06, |
|
"loss": 0.449, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.521876981610653, |
|
"grad_norm": 0.5814801147707117, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4498, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5472415979708307, |
|
"grad_norm": 0.7380939259733779, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4574, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.5726062143310082, |
|
"grad_norm": 0.5158189079851289, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4553, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5979708306911857, |
|
"grad_norm": 0.7517859976181999, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4479, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.6233354470513635, |
|
"grad_norm": 0.4624484508717309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4484, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.6487000634115407, |
|
"grad_norm": 0.6517886187802472, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4479, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6740646797717185, |
|
"grad_norm": 0.5168694302612785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4498, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.699429296131896, |
|
"grad_norm": 0.5442235822761647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4546, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.7247939124920735, |
|
"grad_norm": 0.5866332538354704, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4502, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7501585288522512, |
|
"grad_norm": 0.5771993285709256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4489, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.7755231452124287, |
|
"grad_norm": 0.5856601574541924, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4516, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8008877615726062, |
|
"grad_norm": 0.5219735572020098, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4504, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.8262523779327837, |
|
"grad_norm": 0.5294326989128105, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4512, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8516169942929612, |
|
"grad_norm": 0.5043747110843602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4476, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.876981610653139, |
|
"grad_norm": 0.5243372113736201, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4487, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.9023462270133165, |
|
"grad_norm": 0.46489075414726855, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4477, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.927710843373494, |
|
"grad_norm": 0.47070137502563003, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4491, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.9530754597336717, |
|
"grad_norm": 0.5114250833346574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4497, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.978440076093849, |
|
"grad_norm": 0.44673587993328173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4461, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.9987317691819912, |
|
"eval_loss": 0.4722590744495392, |
|
"eval_runtime": 136.3111, |
|
"eval_samples_per_second": 77.903, |
|
"eval_steps_per_second": 0.609, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 2.0038046924540267, |
|
"grad_norm": 0.5253387493826779, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4406, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.029169308814204, |
|
"grad_norm": 0.5649979072148124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4174, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0545339251743817, |
|
"grad_norm": 0.554555886277626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.416, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.0798985415345594, |
|
"grad_norm": 0.5380213608538502, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4075, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 0.6590004861365489, |
|
"learning_rate": 5e-06, |
|
"loss": 0.415, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.1306277742549145, |
|
"grad_norm": 0.5145129946467305, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4096, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.1559923906150917, |
|
"grad_norm": 0.572199886696882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4189, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.1813570069752695, |
|
"grad_norm": 0.5756593969633285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4201, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.206721623335447, |
|
"grad_norm": 0.5265898189979799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4116, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.2320862396956245, |
|
"grad_norm": 0.5424672160350248, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4099, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.257450856055802, |
|
"grad_norm": 0.5674446384978195, |
|
"learning_rate": 5e-06, |
|
"loss": 0.416, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.2828154724159795, |
|
"grad_norm": 0.5128282183689237, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4157, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.308180088776157, |
|
"grad_norm": 0.5135015935006935, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4172, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.333544705136335, |
|
"grad_norm": 0.596189153928778, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4152, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.3589093214965122, |
|
"grad_norm": 0.5352826549369347, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4149, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.38427393785669, |
|
"grad_norm": 0.5014349895803593, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4141, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 0.4805462505254729, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4176, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.435003170577045, |
|
"grad_norm": 0.6137290218711765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.419, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.4603677869372227, |
|
"grad_norm": 0.5732682054062723, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4163, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.4857324032974, |
|
"grad_norm": 0.4771910554061346, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4134, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.5110970196575777, |
|
"grad_norm": 0.476330897847943, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4225, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.536461636017755, |
|
"grad_norm": 0.47973764991876255, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4145, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5618262523779327, |
|
"grad_norm": 0.5939904213084772, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4153, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.5871908687381104, |
|
"grad_norm": 0.5936679428712734, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4204, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.6125554850982877, |
|
"grad_norm": 0.5188426106745951, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4183, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.6379201014584654, |
|
"grad_norm": 0.5644339619977095, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4126, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.6632847178186427, |
|
"grad_norm": 0.6020266606747191, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4186, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.6886493341788205, |
|
"grad_norm": 0.4752185053914476, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4138, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.714013950538998, |
|
"grad_norm": 0.7626568079783347, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4135, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.739378566899176, |
|
"grad_norm": 0.5108017704950135, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4154, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.764743183259353, |
|
"grad_norm": 0.5746749293115092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4173, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.7901077996195305, |
|
"grad_norm": 0.5467822052037948, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4166, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.815472415979708, |
|
"grad_norm": 0.6357622704499519, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4198, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.840837032339886, |
|
"grad_norm": 0.7346508445377833, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4161, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.8662016487000637, |
|
"grad_norm": 0.4767595766550471, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4136, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.891566265060241, |
|
"grad_norm": 0.5450967603642648, |
|
"learning_rate": 5e-06, |
|
"loss": 0.416, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.9169308814204187, |
|
"grad_norm": 0.6310631600995659, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4156, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.942295497780596, |
|
"grad_norm": 0.4875236135766479, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4135, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.9676601141407737, |
|
"grad_norm": 0.5024341899279373, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4185, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.9930247305009514, |
|
"grad_norm": 0.4812185425989623, |
|
"learning_rate": 5e-06, |
|
"loss": 0.422, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.9980976537729864, |
|
"eval_loss": 0.4713599979877472, |
|
"eval_runtime": 132.9609, |
|
"eval_samples_per_second": 79.866, |
|
"eval_steps_per_second": 0.624, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 2.9980976537729864, |
|
"step": 1182, |
|
"total_flos": 1979475264798720.0, |
|
"train_loss": 0.4632013658985067, |
|
"train_runtime": 20039.4082, |
|
"train_samples_per_second": 30.202, |
|
"train_steps_per_second": 0.059 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1182, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1979475264798720.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|