{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9981447124304266, "eval_steps": 500, "global_step": 1212, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024737167594310452, "grad_norm": 3.4830790839772257, "learning_rate": 5e-06, "loss": 0.8916, "step": 10 }, { "epoch": 0.049474335188620905, "grad_norm": 6.942725060810088, "learning_rate": 5e-06, "loss": 0.7677, "step": 20 }, { "epoch": 0.07421150278293136, "grad_norm": 1.8951062871081399, "learning_rate": 5e-06, "loss": 0.7288, "step": 30 }, { "epoch": 0.09894867037724181, "grad_norm": 1.9357008010716068, "learning_rate": 5e-06, "loss": 0.7031, "step": 40 }, { "epoch": 0.12368583797155226, "grad_norm": 0.9499312221395673, "learning_rate": 5e-06, "loss": 0.6844, "step": 50 }, { "epoch": 0.14842300556586271, "grad_norm": 0.7455987403828673, "learning_rate": 5e-06, "loss": 0.6598, "step": 60 }, { "epoch": 0.17316017316017315, "grad_norm": 0.6470160824911549, "learning_rate": 5e-06, "loss": 0.6485, "step": 70 }, { "epoch": 0.19789734075448362, "grad_norm": 0.6635217517468123, "learning_rate": 5e-06, "loss": 0.6398, "step": 80 }, { "epoch": 0.22263450834879406, "grad_norm": 0.5563048334006576, "learning_rate": 5e-06, "loss": 0.6378, "step": 90 }, { "epoch": 0.24737167594310452, "grad_norm": 0.685513800699662, "learning_rate": 5e-06, "loss": 0.6307, "step": 100 }, { "epoch": 0.272108843537415, "grad_norm": 0.8408848874695213, "learning_rate": 5e-06, "loss": 0.6176, "step": 110 }, { "epoch": 0.29684601113172543, "grad_norm": 0.6773751764933612, "learning_rate": 5e-06, "loss": 0.6178, "step": 120 }, { "epoch": 0.32158317872603587, "grad_norm": 0.7027587036277353, "learning_rate": 5e-06, "loss": 0.6133, "step": 130 }, { "epoch": 0.3463203463203463, "grad_norm": 0.6157245730205899, "learning_rate": 5e-06, "loss": 0.6092, "step": 140 }, { "epoch": 0.37105751391465674, "grad_norm": 0.5802260294344969, "learning_rate": 5e-06, "loss": 0.6046, "step": 150 }, { "epoch": 0.39579468150896724, "grad_norm": 0.6344807037013861, "learning_rate": 5e-06, "loss": 0.6009, "step": 160 }, { "epoch": 0.4205318491032777, "grad_norm": 0.48480545471576164, "learning_rate": 5e-06, "loss": 0.6082, "step": 170 }, { "epoch": 0.4452690166975881, "grad_norm": 0.6380631405566793, "learning_rate": 5e-06, "loss": 0.6004, "step": 180 }, { "epoch": 0.47000618429189855, "grad_norm": 0.5505169198792789, "learning_rate": 5e-06, "loss": 0.5994, "step": 190 }, { "epoch": 0.49474335188620905, "grad_norm": 0.6242896844382176, "learning_rate": 5e-06, "loss": 0.5985, "step": 200 }, { "epoch": 0.5194805194805194, "grad_norm": 0.5255950166858494, "learning_rate": 5e-06, "loss": 0.6022, "step": 210 }, { "epoch": 0.54421768707483, "grad_norm": 0.5516723643293726, "learning_rate": 5e-06, "loss": 0.5927, "step": 220 }, { "epoch": 0.5689548546691404, "grad_norm": 0.584042711614131, "learning_rate": 5e-06, "loss": 0.5925, "step": 230 }, { "epoch": 0.5936920222634509, "grad_norm": 0.6260676537261808, "learning_rate": 5e-06, "loss": 0.5928, "step": 240 }, { "epoch": 0.6184291898577613, "grad_norm": 0.5176619963131881, "learning_rate": 5e-06, "loss": 0.5858, "step": 250 }, { "epoch": 0.6431663574520717, "grad_norm": 0.5024236252235582, "learning_rate": 5e-06, "loss": 0.5879, "step": 260 }, { "epoch": 0.6679035250463822, "grad_norm": 0.6803247050243845, "learning_rate": 5e-06, "loss": 0.5844, "step": 270 }, { "epoch": 0.6926406926406926, "grad_norm": 0.6015791834486387, "learning_rate": 5e-06, "loss": 0.5755, "step": 280 }, { "epoch": 0.717377860235003, "grad_norm": 0.5299909658526826, "learning_rate": 5e-06, "loss": 0.5843, "step": 290 }, { "epoch": 0.7421150278293135, "grad_norm": 0.5088932556423367, "learning_rate": 5e-06, "loss": 0.5864, "step": 300 }, { "epoch": 0.766852195423624, "grad_norm": 0.5964832923195739, "learning_rate": 5e-06, "loss": 0.5822, "step": 310 }, { "epoch": 0.7915893630179345, "grad_norm": 0.5860489284099033, "learning_rate": 5e-06, "loss": 0.5727, "step": 320 }, { "epoch": 0.8163265306122449, "grad_norm": 0.5219580208519626, "learning_rate": 5e-06, "loss": 0.5794, "step": 330 }, { "epoch": 0.8410636982065554, "grad_norm": 0.5269259293045121, "learning_rate": 5e-06, "loss": 0.5793, "step": 340 }, { "epoch": 0.8658008658008658, "grad_norm": 0.5291953882291388, "learning_rate": 5e-06, "loss": 0.5744, "step": 350 }, { "epoch": 0.8905380333951762, "grad_norm": 0.521358302515632, "learning_rate": 5e-06, "loss": 0.5707, "step": 360 }, { "epoch": 0.9152752009894867, "grad_norm": 0.5774675957700983, "learning_rate": 5e-06, "loss": 0.5765, "step": 370 }, { "epoch": 0.9400123685837971, "grad_norm": 0.5172232609124243, "learning_rate": 5e-06, "loss": 0.5714, "step": 380 }, { "epoch": 0.9647495361781077, "grad_norm": 0.5240273423167071, "learning_rate": 5e-06, "loss": 0.5682, "step": 390 }, { "epoch": 0.9894867037724181, "grad_norm": 0.6149272559928857, "learning_rate": 5e-06, "loss": 0.5719, "step": 400 }, { "epoch": 0.9993815708101422, "eval_loss": 0.5683358907699585, "eval_runtime": 219.0047, "eval_samples_per_second": 49.734, "eval_steps_per_second": 0.393, "step": 404 }, { "epoch": 1.0142238713667284, "grad_norm": 0.5073078170608074, "learning_rate": 5e-06, "loss": 0.547, "step": 410 }, { "epoch": 1.0389610389610389, "grad_norm": 0.6437533127258526, "learning_rate": 5e-06, "loss": 0.5316, "step": 420 }, { "epoch": 1.0636982065553493, "grad_norm": 0.6100879209869073, "learning_rate": 5e-06, "loss": 0.5301, "step": 430 }, { "epoch": 1.08843537414966, "grad_norm": 0.48108127450705374, "learning_rate": 5e-06, "loss": 0.5294, "step": 440 }, { "epoch": 1.1131725417439704, "grad_norm": 0.5868928526612365, "learning_rate": 5e-06, "loss": 0.5332, "step": 450 }, { "epoch": 1.1379097093382808, "grad_norm": 0.6202895625151273, "learning_rate": 5e-06, "loss": 0.535, "step": 460 }, { "epoch": 1.1626468769325913, "grad_norm": 0.5075954994852542, "learning_rate": 5e-06, "loss": 0.5326, "step": 470 }, { "epoch": 1.1873840445269017, "grad_norm": 0.4728741075680983, "learning_rate": 5e-06, "loss": 0.532, "step": 480 }, { "epoch": 1.2121212121212122, "grad_norm": 0.5081500756875807, "learning_rate": 5e-06, "loss": 0.5348, "step": 490 }, { "epoch": 1.2368583797155226, "grad_norm": 0.6910919006687529, "learning_rate": 5e-06, "loss": 0.5197, "step": 500 }, { "epoch": 1.261595547309833, "grad_norm": 0.5501274010525514, "learning_rate": 5e-06, "loss": 0.5222, "step": 510 }, { "epoch": 1.2863327149041435, "grad_norm": 0.7316007600357546, "learning_rate": 5e-06, "loss": 0.5335, "step": 520 }, { "epoch": 1.311069882498454, "grad_norm": 0.5057200127850379, "learning_rate": 5e-06, "loss": 0.5288, "step": 530 }, { "epoch": 1.3358070500927643, "grad_norm": 0.5719065403986006, "learning_rate": 5e-06, "loss": 0.529, "step": 540 }, { "epoch": 1.3605442176870748, "grad_norm": 0.5660341689340015, "learning_rate": 5e-06, "loss": 0.5282, "step": 550 }, { "epoch": 1.3852813852813852, "grad_norm": 0.7015838028229988, "learning_rate": 5e-06, "loss": 0.5296, "step": 560 }, { "epoch": 1.4100185528756957, "grad_norm": 0.5640160639184895, "learning_rate": 5e-06, "loss": 0.531, "step": 570 }, { "epoch": 1.434755720470006, "grad_norm": 0.5241187267275058, "learning_rate": 5e-06, "loss": 0.5277, "step": 580 }, { "epoch": 1.4594928880643168, "grad_norm": 0.5322661450321486, "learning_rate": 5e-06, "loss": 0.5223, "step": 590 }, { "epoch": 1.4842300556586272, "grad_norm": 0.5414175662054105, "learning_rate": 5e-06, "loss": 0.5253, "step": 600 }, { "epoch": 1.5089672232529376, "grad_norm": 0.5120018621643256, "learning_rate": 5e-06, "loss": 0.5171, "step": 610 }, { "epoch": 1.533704390847248, "grad_norm": 0.5872164754716582, "learning_rate": 5e-06, "loss": 0.5263, "step": 620 }, { "epoch": 1.5584415584415585, "grad_norm": 0.4661784659324605, "learning_rate": 5e-06, "loss": 0.5217, "step": 630 }, { "epoch": 1.583178726035869, "grad_norm": 0.464230352567236, "learning_rate": 5e-06, "loss": 0.5204, "step": 640 }, { "epoch": 1.6079158936301794, "grad_norm": 0.5032827937357744, "learning_rate": 5e-06, "loss": 0.5216, "step": 650 }, { "epoch": 1.6326530612244898, "grad_norm": 0.4996792160145028, "learning_rate": 5e-06, "loss": 0.5218, "step": 660 }, { "epoch": 1.6573902288188003, "grad_norm": 0.5271632514608294, "learning_rate": 5e-06, "loss": 0.5226, "step": 670 }, { "epoch": 1.6821273964131107, "grad_norm": 0.49347623420693476, "learning_rate": 5e-06, "loss": 0.5193, "step": 680 }, { "epoch": 1.7068645640074211, "grad_norm": 0.6049309413421875, "learning_rate": 5e-06, "loss": 0.5247, "step": 690 }, { "epoch": 1.7316017316017316, "grad_norm": 0.5973360853484692, "learning_rate": 5e-06, "loss": 0.516, "step": 700 }, { "epoch": 1.756338899196042, "grad_norm": 0.49072766464621603, "learning_rate": 5e-06, "loss": 0.5179, "step": 710 }, { "epoch": 1.7810760667903525, "grad_norm": 0.5091642966772914, "learning_rate": 5e-06, "loss": 0.5181, "step": 720 }, { "epoch": 1.805813234384663, "grad_norm": 0.5801588473122157, "learning_rate": 5e-06, "loss": 0.5177, "step": 730 }, { "epoch": 1.8305504019789733, "grad_norm": 0.48516680968832, "learning_rate": 5e-06, "loss": 0.5231, "step": 740 }, { "epoch": 1.8552875695732838, "grad_norm": 0.46039086459013556, "learning_rate": 5e-06, "loss": 0.5249, "step": 750 }, { "epoch": 1.8800247371675942, "grad_norm": 0.5083902833654592, "learning_rate": 5e-06, "loss": 0.518, "step": 760 }, { "epoch": 1.9047619047619047, "grad_norm": 0.5109759169478353, "learning_rate": 5e-06, "loss": 0.5177, "step": 770 }, { "epoch": 1.929499072356215, "grad_norm": 0.4767006976535441, "learning_rate": 5e-06, "loss": 0.5132, "step": 780 }, { "epoch": 1.9542362399505255, "grad_norm": 0.5035389849175771, "learning_rate": 5e-06, "loss": 0.5181, "step": 790 }, { "epoch": 1.978973407544836, "grad_norm": 0.5072097192727141, "learning_rate": 5e-06, "loss": 0.5166, "step": 800 }, { "epoch": 1.9987631416202845, "eval_loss": 0.5430302023887634, "eval_runtime": 219.3095, "eval_samples_per_second": 49.665, "eval_steps_per_second": 0.392, "step": 808 }, { "epoch": 2.0037105751391464, "grad_norm": 0.5742594976510665, "learning_rate": 5e-06, "loss": 0.5105, "step": 810 }, { "epoch": 2.028447742733457, "grad_norm": 0.5679263759068595, "learning_rate": 5e-06, "loss": 0.4801, "step": 820 }, { "epoch": 2.0531849103277673, "grad_norm": 0.5672377634558515, "learning_rate": 5e-06, "loss": 0.4818, "step": 830 }, { "epoch": 2.0779220779220777, "grad_norm": 0.6793815683511014, "learning_rate": 5e-06, "loss": 0.4756, "step": 840 }, { "epoch": 2.102659245516388, "grad_norm": 0.5158458141043201, "learning_rate": 5e-06, "loss": 0.4768, "step": 850 }, { "epoch": 2.1273964131106986, "grad_norm": 0.6722479592255648, "learning_rate": 5e-06, "loss": 0.481, "step": 860 }, { "epoch": 2.1521335807050095, "grad_norm": 0.622090680752648, "learning_rate": 5e-06, "loss": 0.4806, "step": 870 }, { "epoch": 2.17687074829932, "grad_norm": 0.5446538279562969, "learning_rate": 5e-06, "loss": 0.4792, "step": 880 }, { "epoch": 2.2016079158936304, "grad_norm": 0.9897157820333273, "learning_rate": 5e-06, "loss": 0.4743, "step": 890 }, { "epoch": 2.226345083487941, "grad_norm": 0.5709797478235871, "learning_rate": 5e-06, "loss": 0.4745, "step": 900 }, { "epoch": 2.2510822510822512, "grad_norm": 0.5508207898306552, "learning_rate": 5e-06, "loss": 0.4825, "step": 910 }, { "epoch": 2.2758194186765617, "grad_norm": 0.4968712039591529, "learning_rate": 5e-06, "loss": 0.4805, "step": 920 }, { "epoch": 2.300556586270872, "grad_norm": 0.5588550182736718, "learning_rate": 5e-06, "loss": 0.4816, "step": 930 }, { "epoch": 2.3252937538651826, "grad_norm": 0.6560301930501052, "learning_rate": 5e-06, "loss": 0.4834, "step": 940 }, { "epoch": 2.350030921459493, "grad_norm": 0.47583877726868273, "learning_rate": 5e-06, "loss": 0.4785, "step": 950 }, { "epoch": 2.3747680890538034, "grad_norm": 0.5212062104260379, "learning_rate": 5e-06, "loss": 0.4787, "step": 960 }, { "epoch": 2.399505256648114, "grad_norm": 0.47451320868129626, "learning_rate": 5e-06, "loss": 0.4829, "step": 970 }, { "epoch": 2.4242424242424243, "grad_norm": 0.4830738101162483, "learning_rate": 5e-06, "loss": 0.4843, "step": 980 }, { "epoch": 2.4489795918367347, "grad_norm": 0.5182106761363315, "learning_rate": 5e-06, "loss": 0.4757, "step": 990 }, { "epoch": 2.473716759431045, "grad_norm": 0.5153536365130167, "learning_rate": 5e-06, "loss": 0.4768, "step": 1000 }, { "epoch": 2.4984539270253556, "grad_norm": 0.5417873882437457, "learning_rate": 5e-06, "loss": 0.4782, "step": 1010 }, { "epoch": 2.523191094619666, "grad_norm": 0.5527273801359924, "learning_rate": 5e-06, "loss": 0.475, "step": 1020 }, { "epoch": 2.5479282622139765, "grad_norm": 0.5335187564602286, "learning_rate": 5e-06, "loss": 0.4792, "step": 1030 }, { "epoch": 2.572665429808287, "grad_norm": 0.5000885945895702, "learning_rate": 5e-06, "loss": 0.4769, "step": 1040 }, { "epoch": 2.5974025974025974, "grad_norm": 0.518335000088379, "learning_rate": 5e-06, "loss": 0.4771, "step": 1050 }, { "epoch": 2.622139764996908, "grad_norm": 0.5506780400189991, "learning_rate": 5e-06, "loss": 0.4813, "step": 1060 }, { "epoch": 2.6468769325912183, "grad_norm": 0.5009935325893752, "learning_rate": 5e-06, "loss": 0.479, "step": 1070 }, { "epoch": 2.6716141001855287, "grad_norm": 0.5265944850039301, "learning_rate": 5e-06, "loss": 0.4808, "step": 1080 }, { "epoch": 2.696351267779839, "grad_norm": 0.5426869881704958, "learning_rate": 5e-06, "loss": 0.4798, "step": 1090 }, { "epoch": 2.7210884353741496, "grad_norm": 0.6372802273548948, "learning_rate": 5e-06, "loss": 0.4824, "step": 1100 }, { "epoch": 2.74582560296846, "grad_norm": 0.5506322753225094, "learning_rate": 5e-06, "loss": 0.4748, "step": 1110 }, { "epoch": 2.7705627705627704, "grad_norm": 0.6432775550069307, "learning_rate": 5e-06, "loss": 0.4785, "step": 1120 }, { "epoch": 2.795299938157081, "grad_norm": 0.5422596013158468, "learning_rate": 5e-06, "loss": 0.4756, "step": 1130 }, { "epoch": 2.8200371057513913, "grad_norm": 0.5766402488022242, "learning_rate": 5e-06, "loss": 0.4813, "step": 1140 }, { "epoch": 2.8447742733457018, "grad_norm": 0.6098697051701109, "learning_rate": 5e-06, "loss": 0.4813, "step": 1150 }, { "epoch": 2.869511440940012, "grad_norm": 0.6756833035066511, "learning_rate": 5e-06, "loss": 0.4774, "step": 1160 }, { "epoch": 2.8942486085343226, "grad_norm": 0.6408060819703033, "learning_rate": 5e-06, "loss": 0.4762, "step": 1170 }, { "epoch": 2.9189857761286335, "grad_norm": 0.5130121997574635, "learning_rate": 5e-06, "loss": 0.4764, "step": 1180 }, { "epoch": 2.9437229437229435, "grad_norm": 0.5162312463500585, "learning_rate": 5e-06, "loss": 0.4777, "step": 1190 }, { "epoch": 2.9684601113172544, "grad_norm": 0.5924512448077986, "learning_rate": 5e-06, "loss": 0.4791, "step": 1200 }, { "epoch": 2.9931972789115644, "grad_norm": 0.502845968511808, "learning_rate": 5e-06, "loss": 0.4761, "step": 1210 }, { "epoch": 2.9981447124304266, "eval_loss": 0.537095308303833, "eval_runtime": 219.5171, "eval_samples_per_second": 49.618, "eval_steps_per_second": 0.392, "step": 1212 }, { "epoch": 2.9981447124304266, "step": 1212, "total_flos": 2029726382161920.0, "train_loss": 0.5400018749063952, "train_runtime": 36478.2162, "train_samples_per_second": 17.019, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1212, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2029726382161920.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }