{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 21994, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045466945530599254, "grad_norm": 39.23893737792969, "learning_rate": 6.818181818181818e-06, "loss": 3.9076, "step": 500 }, { "epoch": 0.09093389106119851, "grad_norm": 18.871334075927734, "learning_rate": 1.3636363636363637e-05, "loss": 1.415, "step": 1000 }, { "epoch": 0.13640083659179777, "grad_norm": 14.570379257202148, "learning_rate": 2.0454545454545454e-05, "loss": 1.2569, "step": 1500 }, { "epoch": 0.18186778212239701, "grad_norm": 16.230432510375977, "learning_rate": 2.7272727272727273e-05, "loss": 1.2395, "step": 2000 }, { "epoch": 0.22733472765299628, "grad_norm": 15.570598602294922, "learning_rate": 2.954531676265535e-05, "loss": 1.2152, "step": 2500 }, { "epoch": 0.27280167318359555, "grad_norm": 14.000185012817383, "learning_rate": 2.8787511367080933e-05, "loss": 1.1669, "step": 3000 }, { "epoch": 0.31826861871419476, "grad_norm": 19.740062713623047, "learning_rate": 2.802970597150652e-05, "loss": 1.1665, "step": 3500 }, { "epoch": 0.36373556424479403, "grad_norm": 13.226868629455566, "learning_rate": 2.7271900575932103e-05, "loss": 1.1422, "step": 4000 }, { "epoch": 0.4092025097753933, "grad_norm": 8.243535041809082, "learning_rate": 2.6514095180357682e-05, "loss": 1.115, "step": 4500 }, { "epoch": 0.45466945530599256, "grad_norm": 23.291852951049805, "learning_rate": 2.575628978478327e-05, "loss": 1.1059, "step": 5000 }, { "epoch": 0.5001364008365918, "grad_norm": 14.429057121276855, "learning_rate": 2.499848438920885e-05, "loss": 1.1121, "step": 5500 }, { "epoch": 0.5456033463671911, "grad_norm": 9.61043930053711, "learning_rate": 2.4240678993634438e-05, "loss": 1.0893, "step": 6000 }, { "epoch": 0.5910702918977903, "grad_norm": 7.90361213684082, "learning_rate": 2.3482873598060018e-05, "loss": 1.0813, "step": 6500 }, { "epoch": 0.6365372374283895, "grad_norm": 11.351470947265625, "learning_rate": 2.27250682024856e-05, "loss": 1.0667, "step": 7000 }, { "epoch": 0.6820041829589888, "grad_norm": 15.591328620910645, "learning_rate": 2.1967262806911187e-05, "loss": 1.0682, "step": 7500 }, { "epoch": 0.7274711284895881, "grad_norm": 9.37597942352295, "learning_rate": 2.120945741133677e-05, "loss": 1.0462, "step": 8000 }, { "epoch": 0.7729380740201873, "grad_norm": 17.270828247070312, "learning_rate": 2.0451652015762353e-05, "loss": 1.0626, "step": 8500 }, { "epoch": 0.8184050195507866, "grad_norm": 14.909830093383789, "learning_rate": 1.9693846620187936e-05, "loss": 1.0214, "step": 9000 }, { "epoch": 0.8638719650813859, "grad_norm": 7.522629261016846, "learning_rate": 1.893604122461352e-05, "loss": 1.0459, "step": 9500 }, { "epoch": 0.9093389106119851, "grad_norm": 47.28670883178711, "learning_rate": 1.8178235829039105e-05, "loss": 1.0277, "step": 10000 }, { "epoch": 0.9548058561425843, "grad_norm": 14.066143989562988, "learning_rate": 1.7420430433464688e-05, "loss": 1.0183, "step": 10500 }, { "epoch": 1.0002728016731837, "grad_norm": 10.882994651794434, "learning_rate": 1.6662625037890268e-05, "loss": 0.9878, "step": 11000 }, { "epoch": 1.0457397472037828, "grad_norm": 8.7605562210083, "learning_rate": 1.5904819642315854e-05, "loss": 0.8088, "step": 11500 }, { "epoch": 1.0912066927343822, "grad_norm": 5.1129326820373535, "learning_rate": 1.5147014246741437e-05, "loss": 0.8296, "step": 12000 }, { "epoch": 1.1366736382649814, "grad_norm": 11.089341163635254, "learning_rate": 1.438920885116702e-05, "loss": 0.8104, "step": 12500 }, { "epoch": 1.1821405837955807, "grad_norm": 10.97964096069336, "learning_rate": 1.3631403455592605e-05, "loss": 0.8171, "step": 13000 }, { "epoch": 1.22760752932618, "grad_norm": 21.10997200012207, "learning_rate": 1.2873598060018188e-05, "loss": 0.8129, "step": 13500 }, { "epoch": 1.273074474856779, "grad_norm": 1.2680716514587402, "learning_rate": 1.211579266444377e-05, "loss": 0.7884, "step": 14000 }, { "epoch": 1.3185414203873784, "grad_norm": 11.013956069946289, "learning_rate": 1.1357987268869355e-05, "loss": 0.8036, "step": 14500 }, { "epoch": 1.3640083659179776, "grad_norm": 4.082338333129883, "learning_rate": 1.0600181873294938e-05, "loss": 0.7772, "step": 15000 }, { "epoch": 1.409475311448577, "grad_norm": 5.8771071434021, "learning_rate": 9.842376477720523e-06, "loss": 0.8058, "step": 15500 }, { "epoch": 1.4549422569791761, "grad_norm": 10.076021194458008, "learning_rate": 9.084571082146104e-06, "loss": 0.7847, "step": 16000 }, { "epoch": 1.5004092025097755, "grad_norm": 7.645974636077881, "learning_rate": 8.326765686571689e-06, "loss": 0.7793, "step": 16500 }, { "epoch": 1.5458761480403747, "grad_norm": 15.3417329788208, "learning_rate": 7.568960290997272e-06, "loss": 0.7844, "step": 17000 }, { "epoch": 1.5913430935709738, "grad_norm": 6.472328186035156, "learning_rate": 6.8111548954228554e-06, "loss": 0.7767, "step": 17500 }, { "epoch": 1.6368100391015732, "grad_norm": 10.42813777923584, "learning_rate": 6.053349499848439e-06, "loss": 0.7792, "step": 18000 }, { "epoch": 1.6822769846321726, "grad_norm": 3.3688242435455322, "learning_rate": 5.295544104274023e-06, "loss": 0.7702, "step": 18500 }, { "epoch": 1.7277439301627715, "grad_norm": 5.880104064941406, "learning_rate": 4.537738708699606e-06, "loss": 0.8026, "step": 19000 }, { "epoch": 1.7732108756933709, "grad_norm": 20.736509323120117, "learning_rate": 3.7799333131251894e-06, "loss": 0.7483, "step": 19500 }, { "epoch": 1.8186778212239703, "grad_norm": 5.447836399078369, "learning_rate": 3.0221279175507728e-06, "loss": 0.7301, "step": 20000 }, { "epoch": 1.8641447667545694, "grad_norm": 6.6951141357421875, "learning_rate": 2.264322521976356e-06, "loss": 0.7655, "step": 20500 }, { "epoch": 1.9096117122851686, "grad_norm": 5.314289093017578, "learning_rate": 1.50651712640194e-06, "loss": 0.7631, "step": 21000 }, { "epoch": 1.955078657815768, "grad_norm": 9.385791778564453, "learning_rate": 7.487117308275235e-07, "loss": 0.744, "step": 21500 }, { "epoch": 2.0, "step": 21994, "total_flos": 1.2277795031512474e+17, "train_loss": 1.0133790219709247, "train_runtime": 87315.1804, "train_samples_per_second": 3.023, "train_steps_per_second": 0.252 } ], "logging_steps": 500, "max_steps": 21994, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2277795031512474e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }