{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 867, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03460207612456748, "grad_norm": 5.743846893310547, "learning_rate": 5e-06, "loss": 0.9289, "step": 10 }, { "epoch": 0.06920415224913495, "grad_norm": 5.331993103027344, "learning_rate": 5e-06, "loss": 0.8126, "step": 20 }, { "epoch": 0.10380622837370242, "grad_norm": 5.425512790679932, "learning_rate": 5e-06, "loss": 0.7742, "step": 30 }, { "epoch": 0.1384083044982699, "grad_norm": 1.6510249376296997, "learning_rate": 5e-06, "loss": 0.747, "step": 40 }, { "epoch": 0.17301038062283736, "grad_norm": 1.0029033422470093, "learning_rate": 5e-06, "loss": 0.7119, "step": 50 }, { "epoch": 0.20761245674740483, "grad_norm": 1.1877987384796143, "learning_rate": 5e-06, "loss": 0.688, "step": 60 }, { "epoch": 0.2422145328719723, "grad_norm": 1.0992727279663086, "learning_rate": 5e-06, "loss": 0.6709, "step": 70 }, { "epoch": 0.2768166089965398, "grad_norm": 1.0378435850143433, "learning_rate": 5e-06, "loss": 0.6633, "step": 80 }, { "epoch": 0.31141868512110726, "grad_norm": 0.7307208776473999, "learning_rate": 5e-06, "loss": 0.659, "step": 90 }, { "epoch": 0.3460207612456747, "grad_norm": 0.8541375994682312, "learning_rate": 5e-06, "loss": 0.6368, "step": 100 }, { "epoch": 0.3806228373702422, "grad_norm": 0.9289709329605103, "learning_rate": 5e-06, "loss": 0.642, "step": 110 }, { "epoch": 0.41522491349480967, "grad_norm": 0.6918668746948242, "learning_rate": 5e-06, "loss": 0.6332, "step": 120 }, { "epoch": 0.44982698961937717, "grad_norm": 0.7631871700286865, "learning_rate": 5e-06, "loss": 0.6317, "step": 130 }, { "epoch": 0.4844290657439446, "grad_norm": 0.6295446157455444, "learning_rate": 5e-06, "loss": 0.6308, "step": 140 }, { "epoch": 0.5190311418685121, "grad_norm": 0.635179340839386, "learning_rate": 5e-06, "loss": 0.6297, "step": 150 }, { "epoch": 0.5536332179930796, "grad_norm": 0.7158755660057068, "learning_rate": 5e-06, "loss": 0.6251, "step": 160 }, { "epoch": 0.5882352941176471, "grad_norm": 0.8344781994819641, "learning_rate": 5e-06, "loss": 0.609, "step": 170 }, { "epoch": 0.6228373702422145, "grad_norm": 0.7887862324714661, "learning_rate": 5e-06, "loss": 0.6186, "step": 180 }, { "epoch": 0.657439446366782, "grad_norm": 0.7533692717552185, "learning_rate": 5e-06, "loss": 0.6201, "step": 190 }, { "epoch": 0.6920415224913494, "grad_norm": 0.7703404426574707, "learning_rate": 5e-06, "loss": 0.6176, "step": 200 }, { "epoch": 0.726643598615917, "grad_norm": 0.6667165160179138, "learning_rate": 5e-06, "loss": 0.612, "step": 210 }, { "epoch": 0.7612456747404844, "grad_norm": 0.7431259155273438, "learning_rate": 5e-06, "loss": 0.6069, "step": 220 }, { "epoch": 0.7958477508650519, "grad_norm": 0.6150693893432617, "learning_rate": 5e-06, "loss": 0.6055, "step": 230 }, { "epoch": 0.8304498269896193, "grad_norm": 1.0219345092773438, "learning_rate": 5e-06, "loss": 0.6068, "step": 240 }, { "epoch": 0.8650519031141869, "grad_norm": 0.5619581341743469, "learning_rate": 5e-06, "loss": 0.6026, "step": 250 }, { "epoch": 0.8996539792387543, "grad_norm": 0.5808368921279907, "learning_rate": 5e-06, "loss": 0.6077, "step": 260 }, { "epoch": 0.9342560553633218, "grad_norm": 0.6060568690299988, "learning_rate": 5e-06, "loss": 0.6031, "step": 270 }, { "epoch": 0.9688581314878892, "grad_norm": 0.515640139579773, "learning_rate": 5e-06, "loss": 0.6051, "step": 280 }, { "epoch": 1.0, 
"eval_loss": 0.6054237484931946, "eval_runtime": 26.5711, "eval_samples_per_second": 292.988, "eval_steps_per_second": 1.167, "step": 289 }, { "epoch": 1.0034602076124568, "grad_norm": 1.3408666849136353, "learning_rate": 5e-06, "loss": 0.5952, "step": 290 }, { "epoch": 1.0380622837370241, "grad_norm": 1.0180834531784058, "learning_rate": 5e-06, "loss": 0.558, "step": 300 }, { "epoch": 1.0726643598615917, "grad_norm": 0.6628531217575073, "learning_rate": 5e-06, "loss": 0.5671, "step": 310 }, { "epoch": 1.1072664359861593, "grad_norm": 0.6421555876731873, "learning_rate": 5e-06, "loss": 0.5575, "step": 320 }, { "epoch": 1.1418685121107266, "grad_norm": 0.7754614353179932, "learning_rate": 5e-06, "loss": 0.5663, "step": 330 }, { "epoch": 1.1764705882352942, "grad_norm": 0.7297713160514832, "learning_rate": 5e-06, "loss": 0.5553, "step": 340 }, { "epoch": 1.2110726643598615, "grad_norm": 0.7665261030197144, "learning_rate": 5e-06, "loss": 0.5524, "step": 350 }, { "epoch": 1.245674740484429, "grad_norm": 0.7250308394432068, "learning_rate": 5e-06, "loss": 0.5648, "step": 360 }, { "epoch": 1.2802768166089966, "grad_norm": 0.6479423642158508, "learning_rate": 5e-06, "loss": 0.5566, "step": 370 }, { "epoch": 1.314878892733564, "grad_norm": 0.6422529220581055, "learning_rate": 5e-06, "loss": 0.5596, "step": 380 }, { "epoch": 1.3494809688581315, "grad_norm": 0.747588038444519, "learning_rate": 5e-06, "loss": 0.5566, "step": 390 }, { "epoch": 1.3840830449826989, "grad_norm": 0.6396523714065552, "learning_rate": 5e-06, "loss": 0.5582, "step": 400 }, { "epoch": 1.4186851211072664, "grad_norm": 0.6916091442108154, "learning_rate": 5e-06, "loss": 0.5611, "step": 410 }, { "epoch": 1.453287197231834, "grad_norm": 0.5232684016227722, "learning_rate": 5e-06, "loss": 0.5552, "step": 420 }, { "epoch": 1.4878892733564013, "grad_norm": 0.6718825101852417, "learning_rate": 5e-06, "loss": 0.5688, "step": 430 }, { "epoch": 1.5224913494809689, "grad_norm": 0.5741140842437744, "learning_rate": 5e-06, "loss": 0.5525, "step": 440 }, { "epoch": 1.5570934256055362, "grad_norm": 0.643785834312439, "learning_rate": 5e-06, "loss": 0.5617, "step": 450 }, { "epoch": 1.5916955017301038, "grad_norm": 0.7157655954360962, "learning_rate": 5e-06, "loss": 0.5678, "step": 460 }, { "epoch": 1.6262975778546713, "grad_norm": 0.6625480055809021, "learning_rate": 5e-06, "loss": 0.5608, "step": 470 }, { "epoch": 1.6608996539792389, "grad_norm": 0.871629536151886, "learning_rate": 5e-06, "loss": 0.5622, "step": 480 }, { "epoch": 1.6955017301038062, "grad_norm": 0.6766714453697205, "learning_rate": 5e-06, "loss": 0.5498, "step": 490 }, { "epoch": 1.7301038062283736, "grad_norm": 0.5903865694999695, "learning_rate": 5e-06, "loss": 0.5514, "step": 500 }, { "epoch": 1.7647058823529411, "grad_norm": 0.7858901023864746, "learning_rate": 5e-06, "loss": 0.5632, "step": 510 }, { "epoch": 1.7993079584775087, "grad_norm": 0.509254515171051, "learning_rate": 5e-06, "loss": 0.559, "step": 520 }, { "epoch": 1.8339100346020762, "grad_norm": 0.635212779045105, "learning_rate": 5e-06, "loss": 0.5522, "step": 530 }, { "epoch": 1.8685121107266436, "grad_norm": 0.5043730735778809, "learning_rate": 5e-06, "loss": 0.5537, "step": 540 }, { "epoch": 1.903114186851211, "grad_norm": 0.7371939420700073, "learning_rate": 5e-06, "loss": 0.5557, "step": 550 }, { "epoch": 1.9377162629757785, "grad_norm": 0.5097610354423523, "learning_rate": 5e-06, "loss": 0.5481, "step": 560 }, { "epoch": 1.972318339100346, "grad_norm": 0.6075087785720825, "learning_rate": 5e-06, 
"loss": 0.5546, "step": 570 }, { "epoch": 2.0, "eval_loss": 0.5923461318016052, "eval_runtime": 26.1049, "eval_samples_per_second": 298.22, "eval_steps_per_second": 1.188, "step": 578 }, { "epoch": 2.0069204152249136, "grad_norm": 1.147298812866211, "learning_rate": 5e-06, "loss": 0.5407, "step": 580 }, { "epoch": 2.041522491349481, "grad_norm": 0.7817992568016052, "learning_rate": 5e-06, "loss": 0.51, "step": 590 }, { "epoch": 2.0761245674740483, "grad_norm": 0.6091957092285156, "learning_rate": 5e-06, "loss": 0.5043, "step": 600 }, { "epoch": 2.110726643598616, "grad_norm": 0.734910786151886, "learning_rate": 5e-06, "loss": 0.508, "step": 610 }, { "epoch": 2.1453287197231834, "grad_norm": 0.6779142618179321, "learning_rate": 5e-06, "loss": 0.5109, "step": 620 }, { "epoch": 2.179930795847751, "grad_norm": 0.6702868342399597, "learning_rate": 5e-06, "loss": 0.5151, "step": 630 }, { "epoch": 2.2145328719723185, "grad_norm": 0.6096834540367126, "learning_rate": 5e-06, "loss": 0.5119, "step": 640 }, { "epoch": 2.2491349480968856, "grad_norm": 0.7306134700775146, "learning_rate": 5e-06, "loss": 0.5113, "step": 650 }, { "epoch": 2.283737024221453, "grad_norm": 0.6218770742416382, "learning_rate": 5e-06, "loss": 0.5131, "step": 660 }, { "epoch": 2.3183391003460208, "grad_norm": 0.5716001987457275, "learning_rate": 5e-06, "loss": 0.5115, "step": 670 }, { "epoch": 2.3529411764705883, "grad_norm": 0.5372514724731445, "learning_rate": 5e-06, "loss": 0.5188, "step": 680 }, { "epoch": 2.387543252595156, "grad_norm": 0.5553603172302246, "learning_rate": 5e-06, "loss": 0.5135, "step": 690 }, { "epoch": 2.422145328719723, "grad_norm": 0.6243720054626465, "learning_rate": 5e-06, "loss": 0.5182, "step": 700 }, { "epoch": 2.4567474048442905, "grad_norm": 0.6500486135482788, "learning_rate": 5e-06, "loss": 0.5085, "step": 710 }, { "epoch": 2.491349480968858, "grad_norm": 0.6719836592674255, "learning_rate": 5e-06, "loss": 0.514, "step": 720 }, { "epoch": 2.5259515570934257, "grad_norm": 0.6871230602264404, "learning_rate": 5e-06, "loss": 0.5094, "step": 730 }, { "epoch": 2.5605536332179932, "grad_norm": 0.8423399925231934, "learning_rate": 5e-06, "loss": 0.5138, "step": 740 }, { "epoch": 2.595155709342561, "grad_norm": 0.557617723941803, "learning_rate": 5e-06, "loss": 0.5192, "step": 750 }, { "epoch": 2.629757785467128, "grad_norm": 0.7015785574913025, "learning_rate": 5e-06, "loss": 0.5122, "step": 760 }, { "epoch": 2.6643598615916955, "grad_norm": 0.6643463373184204, "learning_rate": 5e-06, "loss": 0.5136, "step": 770 }, { "epoch": 2.698961937716263, "grad_norm": 0.8234036564826965, "learning_rate": 5e-06, "loss": 0.5147, "step": 780 }, { "epoch": 2.7335640138408306, "grad_norm": 0.5616887211799622, "learning_rate": 5e-06, "loss": 0.5147, "step": 790 }, { "epoch": 2.7681660899653977, "grad_norm": 0.6399916410446167, "learning_rate": 5e-06, "loss": 0.5104, "step": 800 }, { "epoch": 2.8027681660899653, "grad_norm": 0.5685309767723083, "learning_rate": 5e-06, "loss": 0.5128, "step": 810 }, { "epoch": 2.837370242214533, "grad_norm": 0.5425155758857727, "learning_rate": 5e-06, "loss": 0.5132, "step": 820 }, { "epoch": 2.8719723183391004, "grad_norm": 0.8712612986564636, "learning_rate": 5e-06, "loss": 0.5074, "step": 830 }, { "epoch": 2.906574394463668, "grad_norm": 0.8342138528823853, "learning_rate": 5e-06, "loss": 0.5215, "step": 840 }, { "epoch": 2.9411764705882355, "grad_norm": 0.7109184861183167, "learning_rate": 5e-06, "loss": 0.5102, "step": 850 }, { "epoch": 2.9757785467128026, "grad_norm": 
0.6473217606544495, "learning_rate": 5e-06, "loss": 0.5168, "step": 860 }, { "epoch": 3.0, "eval_loss": 0.5936245918273926, "eval_runtime": 26.2859, "eval_samples_per_second": 296.166, "eval_steps_per_second": 1.179, "step": 867 }, { "epoch": 3.0, "step": 867, "total_flos": 4.093705665380352e+19, "train_loss": 0.5755822655788893, "train_runtime": 6372.9887, "train_samples_per_second": 69.622, "train_steps_per_second": 0.136 } ], "logging_steps": 10, "max_steps": 867, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.093705665380352e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }