{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5070993914807302, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02535496957403651, "grad_norm": 8.872879028320312, "learning_rate": 4.166666666666667e-05, "loss": 2.5906, "step": 25 }, { "epoch": 0.05070993914807302, "grad_norm": 10.428619384765625, "learning_rate": 4.994602438146344e-05, "loss": 2.704, "step": 50 }, { "epoch": 0.07606490872210954, "grad_norm": 8.182428359985352, "learning_rate": 4.972714782003472e-05, "loss": 2.8506, "step": 75 }, { "epoch": 0.10141987829614604, "grad_norm": 6.291181564331055, "learning_rate": 4.934147215158731e-05, "loss": 3.0837, "step": 100 }, { "epoch": 0.12677484787018256, "grad_norm": 5.771434307098389, "learning_rate": 4.87915989845867e-05, "loss": 2.7152, "step": 125 }, { "epoch": 0.15212981744421908, "grad_norm": 5.961496353149414, "learning_rate": 4.8081237535878116e-05, "loss": 2.884, "step": 150 }, { "epoch": 0.17748478701825557, "grad_norm": 6.111754894256592, "learning_rate": 4.7215179609844665e-05, "loss": 2.8407, "step": 175 }, { "epoch": 0.2028397565922921, "grad_norm": 6.186964988708496, "learning_rate": 4.6199267274877736e-05, "loss": 2.7641, "step": 200 }, { "epoch": 0.2281947261663286, "grad_norm": 6.564132213592529, "learning_rate": 4.504035345520115e-05, "loss": 2.6484, "step": 225 }, { "epoch": 0.2535496957403651, "grad_norm": 2243.718994140625, "learning_rate": 4.374625570388008e-05, "loss": 2.7105, "step": 250 }, { "epoch": 0.2789046653144016, "grad_norm": 11.651649475097656, "learning_rate": 4.2325703468843025e-05, "loss": 2.5379, "step": 275 }, { "epoch": 0.30425963488843816, "grad_norm": 6.585482120513916, "learning_rate": 4.078827920763835e-05, "loss": 2.5446, "step": 300 }, { "epoch": 0.32961460446247465, "grad_norm": 5.859741687774658, "learning_rate": 3.914435374814092e-05, "loss": 2.5159, "step": 325 }, { "epoch": 0.35496957403651114, "grad_norm": 5.331320762634277, "learning_rate": 3.740501633123872e-05, "loss": 2.567, "step": 350 }, { "epoch": 0.3803245436105477, "grad_norm": 5.613489151000977, "learning_rate": 3.558199980740263e-05, "loss": 2.5492, "step": 375 }, { "epoch": 0.4056795131845842, "grad_norm": 5.667895317077637, "learning_rate": 3.368760149173219e-05, "loss": 2.3591, "step": 400 }, { "epoch": 0.43103448275862066, "grad_norm": 5.731890678405762, "learning_rate": 3.1734600211356654e-05, "loss": 2.4657, "step": 425 }, { "epoch": 0.4563894523326572, "grad_norm": 4.859386920928955, "learning_rate": 2.9736170104755075e-05, "loss": 2.4864, "step": 450 }, { "epoch": 0.4817444219066937, "grad_norm": 5.270420551300049, "learning_rate": 2.7705791754469607e-05, "loss": 2.2326, "step": 475 }, { "epoch": 0.5070993914807302, "grad_norm": 5.213580131530762, "learning_rate": 2.5657161252674044e-05, "loss": 2.0531, "step": 500 } ], "logging_steps": 25, "max_steps": 986, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.6110257184768e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }