{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.032520325203252, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 0.7537718524378184, "learning_rate": 4.998825837977733e-05, "loss": 1.0335, "step": 25 }, { "epoch": 0.2, "grad_norm": 0.5115893351462029, "learning_rate": 4.9951068336359185e-05, "loss": 0.9543, "step": 50 }, { "epoch": 0.3, "grad_norm": 0.3655365544393326, "learning_rate": 4.9888447388643216e-05, "loss": 0.889, "step": 75 }, { "epoch": 0.41, "grad_norm": 0.3802482724658219, "learning_rate": 4.980045936184552e-05, "loss": 0.8824, "step": 100 }, { "epoch": 0.51, "grad_norm": 0.46128857579583404, "learning_rate": 4.968719393609757e-05, "loss": 0.8812, "step": 125 }, { "epoch": 0.61, "grad_norm": 0.4675840689300933, "learning_rate": 4.954876655504144e-05, "loss": 0.8626, "step": 150 }, { "epoch": 0.71, "grad_norm": 0.5174033092078555, "learning_rate": 4.938531830816607e-05, "loss": 0.8542, "step": 175 }, { "epoch": 0.81, "grad_norm": 0.47966453174679635, "learning_rate": 4.919701578700444e-05, "loss": 0.8615, "step": 200 }, { "epoch": 0.91, "grad_norm": 0.5800019356792034, "learning_rate": 4.898405091533834e-05, "loss": 0.8198, "step": 225 }, { "epoch": 1.02, "grad_norm": 0.5068384935929343, "learning_rate": 4.874664075358366e-05, "loss": 0.835, "step": 250 }, { "epoch": 1.12, "grad_norm": 0.5665554500957887, "learning_rate": 4.84850272775557e-05, "loss": 0.833, "step": 275 }, { "epoch": 1.22, "grad_norm": 0.6225574393610873, "learning_rate": 4.8199477131839854e-05, "loss": 0.8362, "step": 300 }, { "epoch": 1.32, "grad_norm": 0.5883987854013639, "learning_rate": 4.789028135801918e-05, "loss": 0.8315, "step": 325 }, { "epoch": 1.42, "grad_norm": 0.6212622090526995, "learning_rate": 4.7557755098035814e-05, "loss": 0.8082, "step": 350 }, { "epoch": 1.52, "grad_norm": 0.6254380356435723, "learning_rate": 4.720223727298845e-05, "loss": 0.8112, "step": 375 }, { "epoch": 1.63, "grad_norm": 0.7114667768707209, "learning_rate": 4.682409023769342e-05, "loss": 0.8141, "step": 400 }, { "epoch": 1.73, "grad_norm": 0.7156140969579615, "learning_rate": 4.6423699411361474e-05, "loss": 0.8214, "step": 425 }, { "epoch": 1.83, "grad_norm": 0.6560300477797654, "learning_rate": 4.600147288476647e-05, "loss": 0.819, "step": 450 }, { "epoch": 1.93, "grad_norm": 0.6220749749772762, "learning_rate": 4.5557841004306625e-05, "loss": 0.8177, "step": 475 }, { "epoch": 2.03, "grad_norm": 0.7459915153227248, "learning_rate": 4.509325593338203e-05, "loss": 0.8207, "step": 500 } ], "logging_steps": 25, "max_steps": 2460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 152390335463424.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }