{ "best_metric": 0.6555557250976562, "best_model_checkpoint": "llama2_short/checkpoint-500", "epoch": 4.761904761904762, "eval_steps": 25, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23809523809523808, "grad_norm": 0.09585642069578171, "learning_rate": 9.986018985905901e-05, "loss": 0.8756, "step": 25 }, { "epoch": 0.23809523809523808, "eval_loss": 0.8215582370758057, "eval_runtime": 31.8561, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.439, "step": 25 }, { "epoch": 0.47619047619047616, "grad_norm": 0.09733700007200241, "learning_rate": 9.944154131125642e-05, "loss": 0.789, "step": 50 }, { "epoch": 0.47619047619047616, "eval_loss": 0.7655627131462097, "eval_runtime": 31.8837, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.439, "step": 50 }, { "epoch": 0.7142857142857143, "grad_norm": 0.11074452102184296, "learning_rate": 9.874639560909117e-05, "loss": 0.759, "step": 75 }, { "epoch": 0.7142857142857143, "eval_loss": 0.7467875480651855, "eval_runtime": 31.8741, "eval_samples_per_second": 3.482, "eval_steps_per_second": 0.439, "step": 75 }, { "epoch": 0.9523809523809523, "grad_norm": 0.10002691298723221, "learning_rate": 9.777864028930705e-05, "loss": 0.7402, "step": 100 }, { "epoch": 0.9523809523809523, "eval_loss": 0.7353290915489197, "eval_runtime": 31.8697, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.439, "step": 100 }, { "epoch": 1.1904761904761905, "grad_norm": 0.12271334230899811, "learning_rate": 9.654368743221022e-05, "loss": 0.7267, "step": 125 }, { "epoch": 1.1904761904761905, "eval_loss": 0.7255786657333374, "eval_runtime": 31.8838, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.439, "step": 125 }, { "epoch": 1.4285714285714286, "grad_norm": 0.15203851461410522, "learning_rate": 9.504844339512095e-05, "loss": 0.7228, "step": 150 }, { "epoch": 1.4285714285714286, "eval_loss": 0.7192813754081726, "eval_runtime": 31.8619, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.439, "step": 150 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1312723308801651, "learning_rate": 9.330127018922194e-05, "loss": 0.7173, "step": 175 }, { "epoch": 1.6666666666666665, "eval_loss": 0.7122732400894165, "eval_runtime": 31.8673, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.439, "step": 175 }, { "epoch": 1.9047619047619047, "grad_norm": 0.1437642127275467, "learning_rate": 9.131193871579975e-05, "loss": 0.7194, "step": 200 }, { "epoch": 1.9047619047619047, "eval_loss": 0.7064856886863708, "eval_runtime": 31.8744, "eval_samples_per_second": 3.482, "eval_steps_per_second": 0.439, "step": 200 }, { "epoch": 2.142857142857143, "grad_norm": 0.16143007576465607, "learning_rate": 8.90915741234015e-05, "loss": 0.6987, "step": 225 }, { "epoch": 2.142857142857143, "eval_loss": 0.7001689076423645, "eval_runtime": 31.8599, "eval_samples_per_second": 3.484, "eval_steps_per_second": 0.439, "step": 225 }, { "epoch": 2.380952380952381, "grad_norm": 0.1549287736415863, "learning_rate": 8.665259359149132e-05, "loss": 0.6941, "step": 250 }, { "epoch": 2.380952380952381, "eval_loss": 0.6950200200080872, "eval_runtime": 31.9601, "eval_samples_per_second": 3.473, "eval_steps_per_second": 0.438, "step": 250 }, { "epoch": 2.619047619047619, "grad_norm": 0.15042544901371002, "learning_rate": 8.400863688854597e-05, "loss": 0.6987, "step": 275 }, { "epoch": 2.619047619047619, "eval_loss": 0.6901896595954895, "eval_runtime": 31.9342, "eval_samples_per_second": 3.476, "eval_steps_per_second": 0.438, "step": 275 }, { "epoch": 2.857142857142857, "grad_norm": 0.17507144808769226, "learning_rate": 8.117449009293668e-05, "loss": 0.6953, "step": 300 }, { "epoch": 2.857142857142857, "eval_loss": 0.6860001087188721, "eval_runtime": 31.8997, "eval_samples_per_second": 3.48, "eval_steps_per_second": 0.439, "step": 300 }, { "epoch": 3.0952380952380953, "grad_norm": 0.16489411890506744, "learning_rate": 7.81660029031811e-05, "loss": 0.6832, "step": 325 }, { "epoch": 3.0952380952380953, "eval_loss": 0.681731104850769, "eval_runtime": 31.8876, "eval_samples_per_second": 3.481, "eval_steps_per_second": 0.439, "step": 325 }, { "epoch": 3.3333333333333335, "grad_norm": 0.17146223783493042, "learning_rate": 7.500000000000001e-05, "loss": 0.6784, "step": 350 }, { "epoch": 3.3333333333333335, "eval_loss": 0.6773594617843628, "eval_runtime": 31.8931, "eval_samples_per_second": 3.48, "eval_steps_per_second": 0.439, "step": 350 }, { "epoch": 3.571428571428571, "grad_norm": 0.1854155957698822, "learning_rate": 7.169418695587791e-05, "loss": 0.6756, "step": 375 }, { "epoch": 3.571428571428571, "eval_loss": 0.6735982298851013, "eval_runtime": 31.966, "eval_samples_per_second": 3.472, "eval_steps_per_second": 0.438, "step": 375 }, { "epoch": 3.8095238095238093, "grad_norm": 0.19128429889678955, "learning_rate": 6.826705121831976e-05, "loss": 0.6771, "step": 400 }, { "epoch": 3.8095238095238093, "eval_loss": 0.66991126537323, "eval_runtime": 31.8972, "eval_samples_per_second": 3.48, "eval_steps_per_second": 0.439, "step": 400 }, { "epoch": 4.0476190476190474, "grad_norm": 0.2203032523393631, "learning_rate": 6.473775872054521e-05, "loss": 0.6761, "step": 425 }, { "epoch": 4.0476190476190474, "eval_loss": 0.6663544774055481, "eval_runtime": 31.9622, "eval_samples_per_second": 3.473, "eval_steps_per_second": 0.438, "step": 425 }, { "epoch": 4.285714285714286, "grad_norm": 0.2110411375761032, "learning_rate": 6.112604669781572e-05, "loss": 0.664, "step": 450 }, { "epoch": 4.285714285714286, "eval_loss": 0.6635292768478394, "eval_runtime": 31.9093, "eval_samples_per_second": 3.479, "eval_steps_per_second": 0.439, "step": 450 }, { "epoch": 4.523809523809524, "grad_norm": 0.21833977103233337, "learning_rate": 5.745211330880872e-05, "loss": 0.6618, "step": 475 }, { "epoch": 4.523809523809524, "eval_loss": 0.6595950722694397, "eval_runtime": 31.9849, "eval_samples_per_second": 3.47, "eval_steps_per_second": 0.438, "step": 475 }, { "epoch": 4.761904761904762, "grad_norm": 0.21036215126514435, "learning_rate": 5.373650467932122e-05, "loss": 0.6614, "step": 500 }, { "epoch": 4.761904761904762, "eval_loss": 0.6555557250976562, "eval_runtime": 31.9033, "eval_samples_per_second": 3.479, "eval_steps_per_second": 0.439, "step": 500 } ], "logging_steps": 25, "max_steps": 1050, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.255887790814003e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }