{ "best_metric": null, "best_model_checkpoint": null, "epoch": 120.0, "eval_steps": 17748, "global_step": 177480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 12.0, "grad_norm": 0.5154340267181396, "learning_rate": 9.000563443768313e-06, "loss": 0.2626, "step": 17748 }, { "epoch": 12.0, "eval_loss": 6.236159801483154, "eval_runtime": 32.4206, "eval_samples_per_second": 299.655, "eval_steps_per_second": 6.817, "step": 17748 }, { "epoch": 24.0, "grad_norm": 0.006092763505876064, "learning_rate": 8.001014198782963e-06, "loss": 0.057, "step": 35496 }, { "epoch": 24.0, "eval_loss": 6.354074954986572, "eval_runtime": 32.2885, "eval_samples_per_second": 300.881, "eval_steps_per_second": 6.845, "step": 35496 }, { "epoch": 36.0, "grad_norm": 0.002155415015295148, "learning_rate": 7.0013522650439495e-06, "loss": 0.0428, "step": 53244 }, { "epoch": 36.0, "eval_loss": 6.089537620544434, "eval_runtime": 30.7505, "eval_samples_per_second": 315.93, "eval_steps_per_second": 7.187, "step": 53244 }, { "epoch": 48.0, "grad_norm": 0.05033240467309952, "learning_rate": 6.001746675681768e-06, "loss": 0.0359, "step": 70992 }, { "epoch": 48.0, "eval_loss": 6.1070780754089355, "eval_runtime": 30.9274, "eval_samples_per_second": 314.123, "eval_steps_per_second": 7.146, "step": 70992 }, { "epoch": 60.0, "grad_norm": 0.0004564746341202408, "learning_rate": 5.002253775073248e-06, "loss": 0.0312, "step": 88740 }, { "epoch": 60.0, "eval_loss": 5.999448776245117, "eval_runtime": 30.7851, "eval_samples_per_second": 315.575, "eval_steps_per_second": 7.179, "step": 88740 }, { "epoch": 72.0, "grad_norm": 0.007316610310226679, "learning_rate": 4.002704530087897e-06, "loss": 0.0287, "step": 106488 }, { "epoch": 72.0, "eval_loss": 5.854261875152588, "eval_runtime": 30.4575, "eval_samples_per_second": 318.969, "eval_steps_per_second": 7.256, "step": 106488 }, { "epoch": 84.0, "grad_norm": 0.5285100936889648, "learning_rate": 3.0032116294793783e-06, "loss": 0.0262, "step": 124236 }, { "epoch": 84.0, "eval_loss": 5.759538650512695, "eval_runtime": 30.8128, "eval_samples_per_second": 315.291, "eval_steps_per_second": 7.172, "step": 124236 }, { "epoch": 96.0, "grad_norm": 0.014891779981553555, "learning_rate": 2.003718728870859e-06, "loss": 0.0246, "step": 141984 }, { "epoch": 96.0, "eval_loss": 5.716710567474365, "eval_runtime": 30.804, "eval_samples_per_second": 315.381, "eval_steps_per_second": 7.174, "step": 141984 }, { "epoch": 108.0, "grad_norm": 0.26408958435058594, "learning_rate": 1.0041694838855084e-06, "loss": 0.0227, "step": 159732 }, { "epoch": 108.0, "eval_loss": 5.684043884277344, "eval_runtime": 32.4092, "eval_samples_per_second": 299.76, "eval_steps_per_second": 6.819, "step": 159732 }, { "epoch": 120.0, "grad_norm": 0.0010831266408786178, "learning_rate": 4.676583276988957e-09, "loss": 0.0215, "step": 177480 }, { "epoch": 120.0, "eval_loss": 5.624764919281006, "eval_runtime": 32.2837, "eval_samples_per_second": 300.926, "eval_steps_per_second": 6.846, "step": 177480 }, { "epoch": 120.0, "step": 177480, "total_flos": 2.803155588736635e+18, "train_loss": 0.05530680634294621, "train_runtime": 59846.6408, "train_samples_per_second": 118.561, "train_steps_per_second": 2.966 } ], "logging_steps": 17748, "max_steps": 177480, "num_input_tokens_seen": 0, "num_train_epochs": 120, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.803155588736635e+18, "train_batch_size": 40, "trial_name": null, "trial_params": null }