{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9980976537729864, "eval_steps": 500, "global_step": 1182, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025364616360177554, "grad_norm": 2.4958535272810582, "learning_rate": 5e-06, "loss": 0.7847, "step": 10 }, { "epoch": 0.05072923272035511, "grad_norm": 1.332301089693953, "learning_rate": 5e-06, "loss": 0.6565, "step": 20 }, { "epoch": 0.07609384908053266, "grad_norm": 0.8570758334622178, "learning_rate": 5e-06, "loss": 0.6087, "step": 30 }, { "epoch": 0.10145846544071022, "grad_norm": 0.7961220667217911, "learning_rate": 5e-06, "loss": 0.5923, "step": 40 }, { "epoch": 0.12682308180088775, "grad_norm": 1.0397565268140934, "learning_rate": 5e-06, "loss": 0.5693, "step": 50 }, { "epoch": 0.1521876981610653, "grad_norm": 0.8562621904197287, "learning_rate": 5e-06, "loss": 0.5503, "step": 60 }, { "epoch": 0.17755231452124287, "grad_norm": 0.8886037080258321, "learning_rate": 5e-06, "loss": 0.5498, "step": 70 }, { "epoch": 0.20291693088142043, "grad_norm": 0.6838633568199429, "learning_rate": 5e-06, "loss": 0.534, "step": 80 }, { "epoch": 0.22828154724159797, "grad_norm": 0.6556349806544307, "learning_rate": 5e-06, "loss": 0.535, "step": 90 }, { "epoch": 0.2536461636017755, "grad_norm": 0.7469146203238889, "learning_rate": 5e-06, "loss": 0.5333, "step": 100 }, { "epoch": 0.27901077996195306, "grad_norm": 0.5122739271343442, "learning_rate": 5e-06, "loss": 0.5172, "step": 110 }, { "epoch": 0.3043753963221306, "grad_norm": 0.9622908612260581, "learning_rate": 5e-06, "loss": 0.5257, "step": 120 }, { "epoch": 0.3297400126823082, "grad_norm": 0.6046903397133303, "learning_rate": 5e-06, "loss": 0.5151, "step": 130 }, { "epoch": 0.35510462904248574, "grad_norm": 0.47575500456135494, "learning_rate": 5e-06, "loss": 0.5143, "step": 140 }, { "epoch": 0.3804692454026633, "grad_norm": 0.7013205926571314, "learning_rate": 5e-06, "loss": 0.5144, "step": 150 }, { "epoch": 0.40583386176284086, "grad_norm": 0.6351422540630048, "learning_rate": 5e-06, "loss": 0.5058, "step": 160 }, { "epoch": 0.43119847812301837, "grad_norm": 0.6261693017885483, "learning_rate": 5e-06, "loss": 0.5102, "step": 170 }, { "epoch": 0.45656309448319593, "grad_norm": 0.7605740230985341, "learning_rate": 5e-06, "loss": 0.5078, "step": 180 }, { "epoch": 0.4819277108433735, "grad_norm": 0.5845861272533613, "learning_rate": 5e-06, "loss": 0.5043, "step": 190 }, { "epoch": 0.507292327203551, "grad_norm": 0.7171099558889358, "learning_rate": 5e-06, "loss": 0.5053, "step": 200 }, { "epoch": 0.5326569435637286, "grad_norm": 0.5158539718235372, "learning_rate": 5e-06, "loss": 0.5066, "step": 210 }, { "epoch": 0.5580215599239061, "grad_norm": 0.7716179700630799, "learning_rate": 5e-06, "loss": 0.5037, "step": 220 }, { "epoch": 0.5833861762840837, "grad_norm": 0.5356639716385265, "learning_rate": 5e-06, "loss": 0.5077, "step": 230 }, { "epoch": 0.6087507926442612, "grad_norm": 0.5909560901543055, "learning_rate": 5e-06, "loss": 0.4978, "step": 240 }, { "epoch": 0.6341154090044389, "grad_norm": 0.5612449176342577, "learning_rate": 5e-06, "loss": 0.4955, "step": 250 }, { "epoch": 0.6594800253646164, "grad_norm": 0.8220158158926282, "learning_rate": 5e-06, "loss": 0.4932, "step": 260 }, { "epoch": 0.6848446417247939, "grad_norm": 0.9803427711154427, "learning_rate": 5e-06, "loss": 0.4935, "step": 270 }, { "epoch": 0.7102092580849715, "grad_norm": 0.7003489682973207, "learning_rate": 5e-06, "loss": 0.4921, "step": 280 }, { "epoch": 0.735573874445149, "grad_norm": 0.7155818668831541, "learning_rate": 5e-06, "loss": 0.4941, "step": 290 }, { "epoch": 0.7609384908053266, "grad_norm": 0.5641884255018443, "learning_rate": 5e-06, "loss": 0.4905, "step": 300 }, { "epoch": 0.7863031071655041, "grad_norm": 0.5667685684791592, "learning_rate": 5e-06, "loss": 0.4972, "step": 310 }, { "epoch": 0.8116677235256817, "grad_norm": 0.5424782856163526, "learning_rate": 5e-06, "loss": 0.4908, "step": 320 }, { "epoch": 0.8370323398858592, "grad_norm": 0.555119069867457, "learning_rate": 5e-06, "loss": 0.49, "step": 330 }, { "epoch": 0.8623969562460367, "grad_norm": 0.5540403091132209, "learning_rate": 5e-06, "loss": 0.4892, "step": 340 }, { "epoch": 0.8877615726062144, "grad_norm": 0.6718528259146384, "learning_rate": 5e-06, "loss": 0.4879, "step": 350 }, { "epoch": 0.9131261889663919, "grad_norm": 0.48504592421103015, "learning_rate": 5e-06, "loss": 0.4866, "step": 360 }, { "epoch": 0.9384908053265695, "grad_norm": 0.5794400662308987, "learning_rate": 5e-06, "loss": 0.489, "step": 370 }, { "epoch": 0.963855421686747, "grad_norm": 0.49175786205010735, "learning_rate": 5e-06, "loss": 0.4792, "step": 380 }, { "epoch": 0.9892200380469245, "grad_norm": 0.48088824717550854, "learning_rate": 5e-06, "loss": 0.4793, "step": 390 }, { "epoch": 0.9993658845909955, "eval_loss": 0.48458319902420044, "eval_runtime": 140.5522, "eval_samples_per_second": 75.552, "eval_steps_per_second": 0.591, "step": 394 }, { "epoch": 1.014584654407102, "grad_norm": 0.6241434976553506, "learning_rate": 5e-06, "loss": 0.4668, "step": 400 }, { "epoch": 1.0399492707672797, "grad_norm": 0.5387091155966651, "learning_rate": 5e-06, "loss": 0.4467, "step": 410 }, { "epoch": 1.0653138871274572, "grad_norm": 0.6088667420403366, "learning_rate": 5e-06, "loss": 0.4552, "step": 420 }, { "epoch": 1.0906785034876347, "grad_norm": 0.7635188991702534, "learning_rate": 5e-06, "loss": 0.4569, "step": 430 }, { "epoch": 1.1160431198478122, "grad_norm": 0.5202613636726365, "learning_rate": 5e-06, "loss": 0.4532, "step": 440 }, { "epoch": 1.1414077362079897, "grad_norm": 0.5431289298627378, "learning_rate": 5e-06, "loss": 0.4552, "step": 450 }, { "epoch": 1.1667723525681675, "grad_norm": 0.5447516747773636, "learning_rate": 5e-06, "loss": 0.4517, "step": 460 }, { "epoch": 1.192136968928345, "grad_norm": 0.5811733767557097, "learning_rate": 5e-06, "loss": 0.4596, "step": 470 }, { "epoch": 1.2175015852885225, "grad_norm": 0.5291374404256166, "learning_rate": 5e-06, "loss": 0.4523, "step": 480 }, { "epoch": 1.2428662016487, "grad_norm": 0.920406850160634, "learning_rate": 5e-06, "loss": 0.4512, "step": 490 }, { "epoch": 1.2682308180088775, "grad_norm": 0.5379277068224477, "learning_rate": 5e-06, "loss": 0.4589, "step": 500 }, { "epoch": 1.2935954343690552, "grad_norm": 0.6084288782824112, "learning_rate": 5e-06, "loss": 0.4476, "step": 510 }, { "epoch": 1.3189600507292327, "grad_norm": 0.6373203390142074, "learning_rate": 5e-06, "loss": 0.4508, "step": 520 }, { "epoch": 1.3443246670894102, "grad_norm": 0.5297816500484004, "learning_rate": 5e-06, "loss": 0.4519, "step": 530 }, { "epoch": 1.369689283449588, "grad_norm": 0.5214550304276996, "learning_rate": 5e-06, "loss": 0.4507, "step": 540 }, { "epoch": 1.3950538998097652, "grad_norm": 0.5932937282969459, "learning_rate": 5e-06, "loss": 0.4508, "step": 550 }, { "epoch": 1.420418516169943, "grad_norm": 0.5015573262400715, "learning_rate": 5e-06, "loss": 0.4529, "step": 560 }, { "epoch": 1.4457831325301205, "grad_norm": 0.6541003393290922, "learning_rate": 5e-06, "loss": 0.4487, "step": 570 }, { "epoch": 1.471147748890298, "grad_norm": 0.4738510019221813, "learning_rate": 5e-06, "loss": 0.4437, "step": 580 }, { "epoch": 1.4965123652504757, "grad_norm": 0.5284328908203406, "learning_rate": 5e-06, "loss": 0.449, "step": 590 }, { "epoch": 1.521876981610653, "grad_norm": 0.5814801147707117, "learning_rate": 5e-06, "loss": 0.4498, "step": 600 }, { "epoch": 1.5472415979708307, "grad_norm": 0.7380939259733779, "learning_rate": 5e-06, "loss": 0.4574, "step": 610 }, { "epoch": 1.5726062143310082, "grad_norm": 0.5158189079851289, "learning_rate": 5e-06, "loss": 0.4553, "step": 620 }, { "epoch": 1.5979708306911857, "grad_norm": 0.7517859976181999, "learning_rate": 5e-06, "loss": 0.4479, "step": 630 }, { "epoch": 1.6233354470513635, "grad_norm": 0.4624484508717309, "learning_rate": 5e-06, "loss": 0.4484, "step": 640 }, { "epoch": 1.6487000634115407, "grad_norm": 0.6517886187802472, "learning_rate": 5e-06, "loss": 0.4479, "step": 650 }, { "epoch": 1.6740646797717185, "grad_norm": 0.5168694302612785, "learning_rate": 5e-06, "loss": 0.4498, "step": 660 }, { "epoch": 1.699429296131896, "grad_norm": 0.5442235822761647, "learning_rate": 5e-06, "loss": 0.4546, "step": 670 }, { "epoch": 1.7247939124920735, "grad_norm": 0.5866332538354704, "learning_rate": 5e-06, "loss": 0.4502, "step": 680 }, { "epoch": 1.7501585288522512, "grad_norm": 0.5771993285709256, "learning_rate": 5e-06, "loss": 0.4489, "step": 690 }, { "epoch": 1.7755231452124287, "grad_norm": 0.5856601574541924, "learning_rate": 5e-06, "loss": 0.4516, "step": 700 }, { "epoch": 1.8008877615726062, "grad_norm": 0.5219735572020098, "learning_rate": 5e-06, "loss": 0.4504, "step": 710 }, { "epoch": 1.8262523779327837, "grad_norm": 0.5294326989128105, "learning_rate": 5e-06, "loss": 0.4512, "step": 720 }, { "epoch": 1.8516169942929612, "grad_norm": 0.5043747110843602, "learning_rate": 5e-06, "loss": 0.4476, "step": 730 }, { "epoch": 1.876981610653139, "grad_norm": 0.5243372113736201, "learning_rate": 5e-06, "loss": 0.4487, "step": 740 }, { "epoch": 1.9023462270133165, "grad_norm": 0.46489075414726855, "learning_rate": 5e-06, "loss": 0.4477, "step": 750 }, { "epoch": 1.927710843373494, "grad_norm": 0.47070137502563003, "learning_rate": 5e-06, "loss": 0.4491, "step": 760 }, { "epoch": 1.9530754597336717, "grad_norm": 0.5114250833346574, "learning_rate": 5e-06, "loss": 0.4497, "step": 770 }, { "epoch": 1.978440076093849, "grad_norm": 0.44673587993328173, "learning_rate": 5e-06, "loss": 0.4461, "step": 780 }, { "epoch": 1.9987317691819912, "eval_loss": 0.4722590744495392, "eval_runtime": 136.3111, "eval_samples_per_second": 77.903, "eval_steps_per_second": 0.609, "step": 788 }, { "epoch": 2.0038046924540267, "grad_norm": 0.5253387493826779, "learning_rate": 5e-06, "loss": 0.4406, "step": 790 }, { "epoch": 2.029169308814204, "grad_norm": 0.5649979072148124, "learning_rate": 5e-06, "loss": 0.4174, "step": 800 }, { "epoch": 2.0545339251743817, "grad_norm": 0.554555886277626, "learning_rate": 5e-06, "loss": 0.416, "step": 810 }, { "epoch": 2.0798985415345594, "grad_norm": 0.5380213608538502, "learning_rate": 5e-06, "loss": 0.4075, "step": 820 }, { "epoch": 2.1052631578947367, "grad_norm": 0.6590004861365489, "learning_rate": 5e-06, "loss": 0.415, "step": 830 }, { "epoch": 2.1306277742549145, "grad_norm": 0.5145129946467305, "learning_rate": 5e-06, "loss": 0.4096, "step": 840 }, { "epoch": 2.1559923906150917, "grad_norm": 0.572199886696882, "learning_rate": 5e-06, "loss": 0.4189, "step": 850 }, { "epoch": 2.1813570069752695, "grad_norm": 0.5756593969633285, "learning_rate": 5e-06, "loss": 0.4201, "step": 860 }, { "epoch": 2.206721623335447, "grad_norm": 0.5265898189979799, "learning_rate": 5e-06, "loss": 0.4116, "step": 870 }, { "epoch": 2.2320862396956245, "grad_norm": 0.5424672160350248, "learning_rate": 5e-06, "loss": 0.4099, "step": 880 }, { "epoch": 2.257450856055802, "grad_norm": 0.5674446384978195, "learning_rate": 5e-06, "loss": 0.416, "step": 890 }, { "epoch": 2.2828154724159795, "grad_norm": 0.5128282183689237, "learning_rate": 5e-06, "loss": 0.4157, "step": 900 }, { "epoch": 2.308180088776157, "grad_norm": 0.5135015935006935, "learning_rate": 5e-06, "loss": 0.4172, "step": 910 }, { "epoch": 2.333544705136335, "grad_norm": 0.596189153928778, "learning_rate": 5e-06, "loss": 0.4152, "step": 920 }, { "epoch": 2.3589093214965122, "grad_norm": 0.5352826549369347, "learning_rate": 5e-06, "loss": 0.4149, "step": 930 }, { "epoch": 2.38427393785669, "grad_norm": 0.5014349895803593, "learning_rate": 5e-06, "loss": 0.4141, "step": 940 }, { "epoch": 2.4096385542168672, "grad_norm": 0.4805462505254729, "learning_rate": 5e-06, "loss": 0.4176, "step": 950 }, { "epoch": 2.435003170577045, "grad_norm": 0.6137290218711765, "learning_rate": 5e-06, "loss": 0.419, "step": 960 }, { "epoch": 2.4603677869372227, "grad_norm": 0.5732682054062723, "learning_rate": 5e-06, "loss": 0.4163, "step": 970 }, { "epoch": 2.4857324032974, "grad_norm": 0.4771910554061346, "learning_rate": 5e-06, "loss": 0.4134, "step": 980 }, { "epoch": 2.5110970196575777, "grad_norm": 0.476330897847943, "learning_rate": 5e-06, "loss": 0.4225, "step": 990 }, { "epoch": 2.536461636017755, "grad_norm": 0.47973764991876255, "learning_rate": 5e-06, "loss": 0.4145, "step": 1000 }, { "epoch": 2.5618262523779327, "grad_norm": 0.5939904213084772, "learning_rate": 5e-06, "loss": 0.4153, "step": 1010 }, { "epoch": 2.5871908687381104, "grad_norm": 0.5936679428712734, "learning_rate": 5e-06, "loss": 0.4204, "step": 1020 }, { "epoch": 2.6125554850982877, "grad_norm": 0.5188426106745951, "learning_rate": 5e-06, "loss": 0.4183, "step": 1030 }, { "epoch": 2.6379201014584654, "grad_norm": 0.5644339619977095, "learning_rate": 5e-06, "loss": 0.4126, "step": 1040 }, { "epoch": 2.6632847178186427, "grad_norm": 0.6020266606747191, "learning_rate": 5e-06, "loss": 0.4186, "step": 1050 }, { "epoch": 2.6886493341788205, "grad_norm": 0.4752185053914476, "learning_rate": 5e-06, "loss": 0.4138, "step": 1060 }, { "epoch": 2.714013950538998, "grad_norm": 0.7626568079783347, "learning_rate": 5e-06, "loss": 0.4135, "step": 1070 }, { "epoch": 2.739378566899176, "grad_norm": 0.5108017704950135, "learning_rate": 5e-06, "loss": 0.4154, "step": 1080 }, { "epoch": 2.764743183259353, "grad_norm": 0.5746749293115092, "learning_rate": 5e-06, "loss": 0.4173, "step": 1090 }, { "epoch": 2.7901077996195305, "grad_norm": 0.5467822052037948, "learning_rate": 5e-06, "loss": 0.4166, "step": 1100 }, { "epoch": 2.815472415979708, "grad_norm": 0.6357622704499519, "learning_rate": 5e-06, "loss": 0.4198, "step": 1110 }, { "epoch": 2.840837032339886, "grad_norm": 0.7346508445377833, "learning_rate": 5e-06, "loss": 0.4161, "step": 1120 }, { "epoch": 2.8662016487000637, "grad_norm": 0.4767595766550471, "learning_rate": 5e-06, "loss": 0.4136, "step": 1130 }, { "epoch": 2.891566265060241, "grad_norm": 0.5450967603642648, "learning_rate": 5e-06, "loss": 0.416, "step": 1140 }, { "epoch": 2.9169308814204187, "grad_norm": 0.6310631600995659, "learning_rate": 5e-06, "loss": 0.4156, "step": 1150 }, { "epoch": 2.942295497780596, "grad_norm": 0.4875236135766479, "learning_rate": 5e-06, "loss": 0.4135, "step": 1160 }, { "epoch": 2.9676601141407737, "grad_norm": 0.5024341899279373, "learning_rate": 5e-06, "loss": 0.4185, "step": 1170 }, { "epoch": 2.9930247305009514, "grad_norm": 0.4812185425989623, "learning_rate": 5e-06, "loss": 0.422, "step": 1180 }, { "epoch": 2.9980976537729864, "eval_loss": 0.4713599979877472, "eval_runtime": 132.9609, "eval_samples_per_second": 79.866, "eval_steps_per_second": 0.624, "step": 1182 }, { "epoch": 2.9980976537729864, "step": 1182, "total_flos": 1979475264798720.0, "train_loss": 0.4632013658985067, "train_runtime": 20039.4082, "train_samples_per_second": 30.202, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 1182, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1979475264798720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }