{ "best_metric": 0.8310017585754395, "best_model_checkpoint": "data/Mistral-7B_task-1_120-samples_config-2_full_auto/checkpoint-33", "epoch": 12.909090909090908, "eval_steps": 500, "global_step": 71, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18181818181818182, "grad_norm": 1.9838634729385376, "learning_rate": 4.000000000000001e-06, "loss": 1.7113, "step": 1 }, { "epoch": 0.36363636363636365, "grad_norm": 2.1368112564086914, "learning_rate": 8.000000000000001e-06, "loss": 1.8137, "step": 2 }, { "epoch": 0.7272727272727273, "grad_norm": 1.8914555311203003, "learning_rate": 1.6000000000000003e-05, "loss": 1.7345, "step": 4 }, { "epoch": 0.9090909090909091, "eval_loss": 1.6530523300170898, "eval_runtime": 9.9736, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "step": 5 }, { "epoch": 1.0909090909090908, "grad_norm": 1.875381588935852, "learning_rate": 2.4e-05, "loss": 1.7388, "step": 6 }, { "epoch": 1.4545454545454546, "grad_norm": 1.2540829181671143, "learning_rate": 3.2000000000000005e-05, "loss": 1.5623, "step": 8 }, { "epoch": 1.8181818181818183, "grad_norm": 1.1384962797164917, "learning_rate": 4e-05, "loss": 1.5053, "step": 10 }, { "epoch": 2.0, "eval_loss": 1.3127946853637695, "eval_runtime": 9.9703, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "step": 11 }, { "epoch": 2.1818181818181817, "grad_norm": 1.1722956895828247, "learning_rate": 4.8e-05, "loss": 1.2826, "step": 12 }, { "epoch": 2.5454545454545454, "grad_norm": 1.5166568756103516, "learning_rate": 5.6000000000000006e-05, "loss": 1.2215, "step": 14 }, { "epoch": 2.909090909090909, "grad_norm": 1.1790285110473633, "learning_rate": 6.400000000000001e-05, "loss": 1.0785, "step": 16 }, { "epoch": 2.909090909090909, "eval_loss": 0.9975817203521729, "eval_runtime": 9.9664, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "step": 16 }, { "epoch": 3.2727272727272725, "grad_norm": 1.091530203819275, "learning_rate": 7.2e-05, "loss": 0.939, "step": 18 }, { "epoch": 3.6363636363636362, "grad_norm": 0.8115152716636658, "learning_rate": 8e-05, "loss": 0.8785, "step": 20 }, { "epoch": 4.0, "grad_norm": 0.7138871550559998, "learning_rate": 8.800000000000001e-05, "loss": 0.8576, "step": 22 }, { "epoch": 4.0, "eval_loss": 0.8724855780601501, "eval_runtime": 9.9808, "eval_samples_per_second": 2.405, "eval_steps_per_second": 2.405, "step": 22 }, { "epoch": 4.363636363636363, "grad_norm": 0.621791660785675, "learning_rate": 9.6e-05, "loss": 0.8156, "step": 24 }, { "epoch": 4.7272727272727275, "grad_norm": 0.547651469707489, "learning_rate": 9.999512620046522e-05, "loss": 0.7886, "step": 26 }, { "epoch": 4.909090909090909, "eval_loss": 0.835381031036377, "eval_runtime": 9.9679, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "step": 27 }, { "epoch": 5.090909090909091, "grad_norm": 0.6008999347686768, "learning_rate": 9.995614150494293e-05, "loss": 0.7344, "step": 28 }, { "epoch": 5.454545454545454, "grad_norm": 0.5711919665336609, "learning_rate": 9.987820251299122e-05, "loss": 0.6908, "step": 30 }, { "epoch": 5.818181818181818, "grad_norm": 0.6187372207641602, "learning_rate": 9.976136999909156e-05, "loss": 0.6952, "step": 32 }, { "epoch": 6.0, "eval_loss": 0.8310017585754395, "eval_runtime": 9.9648, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "step": 33 }, { "epoch": 6.181818181818182, "grad_norm": 0.6039671897888184, "learning_rate": 9.96057350657239e-05, "loss": 0.6334, 
"step": 34 }, { "epoch": 6.545454545454545, "grad_norm": 0.650557279586792, "learning_rate": 9.941141907232765e-05, "loss": 0.6245, "step": 36 }, { "epoch": 6.909090909090909, "grad_norm": 0.7548458576202393, "learning_rate": 9.917857354066931e-05, "loss": 0.5826, "step": 38 }, { "epoch": 6.909090909090909, "eval_loss": 0.8411148190498352, "eval_runtime": 9.9977, "eval_samples_per_second": 2.401, "eval_steps_per_second": 2.401, "step": 38 }, { "epoch": 7.2727272727272725, "grad_norm": 0.7017354965209961, "learning_rate": 9.890738003669029e-05, "loss": 0.532, "step": 40 }, { "epoch": 7.636363636363637, "grad_norm": 0.8429603576660156, "learning_rate": 9.859805002892732e-05, "loss": 0.462, "step": 42 }, { "epoch": 8.0, "grad_norm": 1.2680151462554932, "learning_rate": 9.825082472361557e-05, "loss": 0.4708, "step": 44 }, { "epoch": 8.0, "eval_loss": 0.8912167549133301, "eval_runtime": 9.9668, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "step": 44 }, { "epoch": 8.363636363636363, "grad_norm": 0.962690532207489, "learning_rate": 9.786597487660337e-05, "loss": 0.3822, "step": 46 }, { "epoch": 8.727272727272727, "grad_norm": 0.9863336682319641, "learning_rate": 9.744380058222483e-05, "loss": 0.3586, "step": 48 }, { "epoch": 8.909090909090908, "eval_loss": 0.9640557765960693, "eval_runtime": 9.9737, "eval_samples_per_second": 2.406, "eval_steps_per_second": 2.406, "step": 49 }, { "epoch": 9.090909090909092, "grad_norm": 1.2356395721435547, "learning_rate": 9.698463103929542e-05, "loss": 0.3379, "step": 50 }, { "epoch": 9.454545454545455, "grad_norm": 1.0257391929626465, "learning_rate": 9.648882429441257e-05, "loss": 0.261, "step": 52 }, { "epoch": 9.818181818181818, "grad_norm": 1.2543127536773682, "learning_rate": 9.595676696276172e-05, "loss": 0.2553, "step": 54 }, { "epoch": 10.0, "eval_loss": 1.0596362352371216, "eval_runtime": 9.9679, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "step": 55 }, { "epoch": 10.181818181818182, "grad_norm": 0.8665857911109924, "learning_rate": 9.538887392664544e-05, "loss": 0.2243, "step": 56 }, { "epoch": 10.545454545454545, "grad_norm": 0.9961050152778625, "learning_rate": 9.478558801197065e-05, "loss": 0.193, "step": 58 }, { "epoch": 10.909090909090908, "grad_norm": 1.193697452545166, "learning_rate": 9.414737964294636e-05, "loss": 0.2006, "step": 60 }, { "epoch": 10.909090909090908, "eval_loss": 1.1654163599014282, "eval_runtime": 9.9703, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "step": 60 }, { "epoch": 11.272727272727273, "grad_norm": 0.8195732831954956, "learning_rate": 9.347474647526095e-05, "loss": 0.1562, "step": 62 }, { "epoch": 11.636363636363637, "grad_norm": 1.2003705501556396, "learning_rate": 9.276821300802534e-05, "loss": 0.1497, "step": 64 }, { "epoch": 12.0, "grad_norm": 1.547230839729309, "learning_rate": 9.202833017478422e-05, "loss": 0.1612, "step": 66 }, { "epoch": 12.0, "eval_loss": 1.235763430595398, "eval_runtime": 9.9721, "eval_samples_per_second": 2.407, "eval_steps_per_second": 2.407, "step": 66 }, { "epoch": 12.363636363636363, "grad_norm": 0.8761946558952332, "learning_rate": 9.125567491391476e-05, "loss": 0.1261, "step": 68 }, { "epoch": 12.727272727272727, "grad_norm": 1.0304591655731201, "learning_rate": 9.045084971874738e-05, "loss": 0.1398, "step": 70 }, { "epoch": 12.909090909090908, "eval_loss": 1.2212626934051514, "eval_runtime": 9.9686, "eval_samples_per_second": 2.408, "eval_steps_per_second": 2.408, "step": 71 }, { "epoch": 12.909090909090908, "step": 71, 
"total_flos": 1.885257891787571e+16, "train_loss": 0.6939811198644235, "train_runtime": 1389.882, "train_samples_per_second": 3.166, "train_steps_per_second": 0.18 } ], "logging_steps": 2, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 7, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.885257891787571e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }