{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5463, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018304960644334616, "grad_norm": 0.11226867139339447, "learning_rate": 2.0000000000000003e-06, "loss": 2.4178, "step": 100 }, { "epoch": 0.03660992128866923, "grad_norm": 0.19307924807071686, "learning_rate": 4.000000000000001e-06, "loss": 2.4311, "step": 200 }, { "epoch": 0.054914881933003847, "grad_norm": 0.1868831366300583, "learning_rate": 6e-06, "loss": 2.4251, "step": 300 }, { "epoch": 0.07321984257733846, "grad_norm": 0.2497679591178894, "learning_rate": 8.000000000000001e-06, "loss": 2.4004, "step": 400 }, { "epoch": 0.09152480322167307, "grad_norm": 0.3507976830005646, "learning_rate": 1e-05, "loss": 2.3282, "step": 500 }, { "epoch": 0.10982976386600769, "grad_norm": 0.40492674708366394, "learning_rate": 1.2e-05, "loss": 2.2985, "step": 600 }, { "epoch": 0.1281347245103423, "grad_norm": 0.449167400598526, "learning_rate": 1.4e-05, "loss": 2.2815, "step": 700 }, { "epoch": 0.14643968515467692, "grad_norm": 0.510024905204773, "learning_rate": 1.6000000000000003e-05, "loss": 2.2252, "step": 800 }, { "epoch": 0.16474464579901152, "grad_norm": 0.5523568987846375, "learning_rate": 1.8e-05, "loss": 2.222, "step": 900 }, { "epoch": 0.18304960644334614, "grad_norm": 0.6460111141204834, "learning_rate": 2e-05, "loss": 2.2266, "step": 1000 }, { "epoch": 0.20135456708768076, "grad_norm": 0.6135373711585999, "learning_rate": 1.9975235096487343e-05, "loss": 2.1631, "step": 1100 }, { "epoch": 0.21965952773201539, "grad_norm": 0.750421941280365, "learning_rate": 1.990106304603857e-05, "loss": 2.1781, "step": 1200 }, { "epoch": 0.23796448837634998, "grad_norm": 0.6863061189651489, "learning_rate": 1.9777851221388214e-05, "loss": 2.1576, "step": 1300 }, { "epoch": 0.2562694490206846, "grad_norm": 0.592878520488739, "learning_rate": 1.9606209888326098e-05, "loss": 2.142, "step": 1400 }, { "epoch": 0.2745744096650192, "grad_norm": 0.9680142998695374, "learning_rate": 1.9386989183062637e-05, "loss": 2.1088, "step": 1500 }, { "epoch": 0.29287937030935385, "grad_norm": 0.8196300864219666, "learning_rate": 1.9121274901520593e-05, "loss": 2.0937, "step": 1600 }, { "epoch": 0.31118433095368847, "grad_norm": 0.935396671295166, "learning_rate": 1.881038312140883e-05, "loss": 2.0938, "step": 1700 }, { "epoch": 0.32948929159802304, "grad_norm": 0.7880417704582214, "learning_rate": 1.8455853683714823e-05, "loss": 2.0672, "step": 1800 }, { "epoch": 0.34779425224235766, "grad_norm": 0.8693708777427673, "learning_rate": 1.805944256590194e-05, "loss": 2.0529, "step": 1900 }, { "epoch": 0.3660992128866923, "grad_norm": 0.7159385681152344, "learning_rate": 1.7623113184586985e-05, "loss": 2.0298, "step": 2000 }, { "epoch": 0.3844041735310269, "grad_norm": 0.7743539214134216, "learning_rate": 1.7149026670775558e-05, "loss": 2.0681, "step": 2100 }, { "epoch": 0.4027091341753615, "grad_norm": 0.892536997795105, "learning_rate": 1.6639531165821896e-05, "loss": 2.0808, "step": 2200 }, { "epoch": 0.42101409481969615, "grad_norm": 1.1176153421401978, "learning_rate": 1.6097150191130056e-05, "loss": 2.0381, "step": 2300 }, { "epoch": 0.43931905546403077, "grad_norm": 1.0423996448516846, "learning_rate": 1.5524570149201115e-05, "loss": 2.0246, "step": 2400 }, { "epoch": 0.4576240161083654, "grad_norm": 0.8842440843582153, "learning_rate": 1.4924627017933402e-05, "loss": 2.0534, "step": 2500 }, { "epoch": 0.47592897675269996, "grad_norm": 0.8251581192016602, "learning_rate": 1.4300292304078696e-05, "loss": 2.0748, "step": 2600 }, { "epoch": 0.4942339373970346, "grad_norm": 0.9325842261314392, "learning_rate": 1.3654658325426641e-05, "loss": 2.0505, "step": 2700 }, { "epoch": 0.5125388980413692, "grad_norm": 0.882586658000946, "learning_rate": 1.2990922894614404e-05, "loss": 2.0338, "step": 2800 }, { "epoch": 0.5308438586857038, "grad_norm": 1.3278307914733887, "learning_rate": 1.2312373480422384e-05, "loss": 2.0244, "step": 2900 }, { "epoch": 0.5491488193300385, "grad_norm": 1.3495004177093506, "learning_rate": 1.1622370925004782e-05, "loss": 2.0226, "step": 3000 }, { "epoch": 0.5674537799743731, "grad_norm": 0.730983316898346, "learning_rate": 1.0924332797703284e-05, "loss": 1.9642, "step": 3100 }, { "epoch": 0.5857587406187077, "grad_norm": 1.0524176359176636, "learning_rate": 1.0221716467892045e-05, "loss": 2.0393, "step": 3200 }, { "epoch": 0.6040637012630423, "grad_norm": 0.8124818205833435, "learning_rate": 9.518001980693905e-06, "loss": 2.0066, "step": 3300 }, { "epoch": 0.6223686619073769, "grad_norm": 0.9999445676803589, "learning_rate": 8.816674820384044e-06, "loss": 2.0355, "step": 3400 }, { "epoch": 0.6406736225517116, "grad_norm": 1.3435726165771484, "learning_rate": 8.121208646853637e-06, "loss": 2.0393, "step": 3500 }, { "epoch": 0.6589785831960461, "grad_norm": 1.0357476472854614, "learning_rate": 7.435048090639456e-06, "loss": 2.0351, "step": 3600 }, { "epoch": 0.6772835438403807, "grad_norm": 1.3906038999557495, "learning_rate": 6.7615916917352545e-06, "loss": 2.0292, "step": 3700 }, { "epoch": 0.6955885044847153, "grad_norm": 1.0058213472366333, "learning_rate": 6.104175066688805e-06, "loss": 2.0046, "step": 3800 }, { "epoch": 0.7138934651290499, "grad_norm": 0.9746403694152832, "learning_rate": 5.466054387357491e-06, "loss": 2.0278, "step": 3900 }, { "epoch": 0.7321984257733846, "grad_norm": 0.7524260878562927, "learning_rate": 4.8503902531519224e-06, "loss": 2.026, "step": 4000 }, { "epoch": 0.7505033864177192, "grad_norm": 0.848231852054596, "learning_rate": 4.260232036648057e-06, "loss": 1.994, "step": 4100 }, { "epoch": 0.7688083470620538, "grad_norm": 0.9133473634719849, "learning_rate": 3.6985027801036876e-06, "loss": 2.0077, "step": 4200 }, { "epoch": 0.7871133077063884, "grad_norm": 1.1169400215148926, "learning_rate": 3.167984717686521e-06, "loss": 2.0371, "step": 4300 }, { "epoch": 0.805418268350723, "grad_norm": 1.1980133056640625, "learning_rate": 2.6713054951220497e-06, "loss": 2.0253, "step": 4400 }, { "epoch": 0.8237232289950577, "grad_norm": 0.8313106894493103, "learning_rate": 2.2109251550149922e-06, "loss": 1.9908, "step": 4500 }, { "epoch": 0.8420281896393923, "grad_norm": 0.9666856527328491, "learning_rate": 1.7891239523057202e-06, "loss": 1.9913, "step": 4600 }, { "epoch": 0.8603331502837269, "grad_norm": 0.9010581970214844, "learning_rate": 1.4079910602115544e-06, "loss": 1.9869, "step": 4700 }, { "epoch": 0.8786381109280615, "grad_norm": 1.0931150913238525, "learning_rate": 1.0694142225921444e-06, "loss": 1.9533, "step": 4800 }, { "epoch": 0.8969430715723962, "grad_norm": 0.984924852848053, "learning_rate": 7.750704039905343e-07, "loss": 2.0316, "step": 4900 }, { "epoch": 0.9152480322167308, "grad_norm": 0.9124763011932373, "learning_rate": 5.264174836601732e-07, "loss": 2.0208, "step": 5000 }, { "epoch": 0.9335529928610653, "grad_norm": 1.179923176765442, "learning_rate": 3.24687034717085e-07, "loss": 2.0175, "step": 5100 }, { "epoch": 0.9518579535053999, "grad_norm": 1.0648012161254883, "learning_rate": 1.7087822418199506e-07, "loss": 2.0134, "step": 5200 }, { "epoch": 0.9701629141497345, "grad_norm": 0.8695976734161377, "learning_rate": 6.575286412536686e-08, "loss": 2.0041, "step": 5300 }, { "epoch": 0.9884678747940692, "grad_norm": 1.045597791671753, "learning_rate": 9.831638426904821e-09, "loss": 2.0033, "step": 5400 }, { "epoch": 1.0, "step": 5463, "total_flos": 9.9498061824e+16, "train_loss": 2.0932261693746033, "train_runtime": 883.1108, "train_samples_per_second": 12.371, "train_steps_per_second": 6.186 } ], "logging_steps": 100, "max_steps": 5463, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.9498061824e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }