|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 360, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06944444444444445, |
|
"grad_norm": 6.3636603355407715, |
|
"learning_rate": 1e-06, |
|
"loss": 2.6727, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"grad_norm": 7.486879825592041, |
|
"learning_rate": 1e-06, |
|
"loss": 2.3642, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 6.5991997718811035, |
|
"learning_rate": 1e-06, |
|
"loss": 2.515, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 7.580630779266357, |
|
"learning_rate": 1e-06, |
|
"loss": 2.3997, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3472222222222222, |
|
"grad_norm": 7.5727410316467285, |
|
"learning_rate": 1e-06, |
|
"loss": 2.4551, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 8.835946083068848, |
|
"learning_rate": 1e-06, |
|
"loss": 2.4476, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4861111111111111, |
|
"grad_norm": 7.495606899261475, |
|
"learning_rate": 1e-06, |
|
"loss": 2.4191, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 8.057035446166992, |
|
"learning_rate": 1e-06, |
|
"loss": 2.441, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 6.828744411468506, |
|
"learning_rate": 1e-06, |
|
"loss": 2.3052, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 7.163251876831055, |
|
"learning_rate": 1e-06, |
|
"loss": 2.1357, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7638888888888888, |
|
"grad_norm": 5.414941787719727, |
|
"learning_rate": 1e-06, |
|
"loss": 2.2248, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 6.0801544189453125, |
|
"learning_rate": 1e-06, |
|
"loss": 2.2934, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9027777777777778, |
|
"grad_norm": 6.054081439971924, |
|
"learning_rate": 1e-06, |
|
"loss": 2.3014, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"grad_norm": 5.827741622924805, |
|
"learning_rate": 1e-06, |
|
"loss": 2.2515, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0416666666666667, |
|
"grad_norm": 3.5676162242889404, |
|
"learning_rate": 1e-06, |
|
"loss": 2.0915, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 5.15900993347168, |
|
"learning_rate": 1e-06, |
|
"loss": 2.0749, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.1805555555555556, |
|
"grad_norm": 5.206437110900879, |
|
"learning_rate": 1e-06, |
|
"loss": 2.0539, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 5.990969657897949, |
|
"learning_rate": 1e-06, |
|
"loss": 2.1308, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3194444444444444, |
|
"grad_norm": 6.198008060455322, |
|
"learning_rate": 1e-06, |
|
"loss": 2.3256, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 5.184628486633301, |
|
"learning_rate": 1e-06, |
|
"loss": 2.1566, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"eval_loss": 2.0880796909332275, |
|
"eval_runtime": 34.0667, |
|
"eval_samples_per_second": 2.935, |
|
"eval_steps_per_second": 0.734, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.4583333333333333, |
|
"grad_norm": 5.412724494934082, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9085, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.5277777777777777, |
|
"grad_norm": 3.459959030151367, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9494, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.5972222222222223, |
|
"grad_norm": 5.159445762634277, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9334, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 5.133082389831543, |
|
"learning_rate": 1e-06, |
|
"loss": 2.0826, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.7361111111111112, |
|
"grad_norm": 4.473026752471924, |
|
"learning_rate": 1e-06, |
|
"loss": 2.0585, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.8055555555555556, |
|
"grad_norm": 5.063863754272461, |
|
"learning_rate": 1e-06, |
|
"loss": 2.1289, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 4.927737236022949, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9872, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"grad_norm": 5.563902854919434, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9803, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.013888888888889, |
|
"grad_norm": 3.901442050933838, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8309, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 3.771136999130249, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7758, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.1527777777777777, |
|
"grad_norm": 4.6159257888793945, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9193, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 3.758843183517456, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9329, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.2916666666666665, |
|
"grad_norm": 4.267579078674316, |
|
"learning_rate": 1e-06, |
|
"loss": 2.0399, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.361111111111111, |
|
"grad_norm": 3.9819560050964355, |
|
"learning_rate": 1e-06, |
|
"loss": 1.9568, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.4305555555555554, |
|
"grad_norm": 3.8918192386627197, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7377, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.9746928215026855, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8949, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.5694444444444446, |
|
"grad_norm": 3.328784704208374, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6509, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.638888888888889, |
|
"grad_norm": 3.835324287414551, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8321, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.7083333333333335, |
|
"grad_norm": 3.3603885173797607, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8628, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 3.7577502727508545, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8447, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"eval_loss": 1.8452154397964478, |
|
"eval_runtime": 34.0911, |
|
"eval_samples_per_second": 2.933, |
|
"eval_steps_per_second": 0.733, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.8472222222222223, |
|
"grad_norm": 4.379385948181152, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8212, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 3.7095022201538086, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7862, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.986111111111111, |
|
"grad_norm": 4.164438724517822, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8046, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.0555555555555554, |
|
"grad_norm": 3.6749582290649414, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6358, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 3.7247958183288574, |
|
"learning_rate": 1e-06, |
|
"loss": 1.791, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.1944444444444446, |
|
"grad_norm": 2.9533472061157227, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6251, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.263888888888889, |
|
"grad_norm": 4.062502384185791, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6976, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 4.328882217407227, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8438, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.4027777777777777, |
|
"grad_norm": 4.158596038818359, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8998, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.4722222222222223, |
|
"grad_norm": 5.7752556800842285, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7517, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.5416666666666665, |
|
"grad_norm": 4.568635940551758, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6835, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.611111111111111, |
|
"grad_norm": 3.6611974239349365, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7852, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.6805555555555554, |
|
"grad_norm": 4.026912212371826, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7916, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 4.750195026397705, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7584, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.8194444444444446, |
|
"grad_norm": 3.936798572540283, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5877, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 3.888888888888889, |
|
"grad_norm": 4.1127800941467285, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5392, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.9583333333333335, |
|
"grad_norm": 3.6437580585479736, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6125, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 4.027777777777778, |
|
"grad_norm": 3.641177177429199, |
|
"learning_rate": 1e-06, |
|
"loss": 1.687, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.097222222222222, |
|
"grad_norm": 3.797327995300293, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7779, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 5.071943283081055, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7103, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"eval_loss": 1.6850143671035767, |
|
"eval_runtime": 34.4694, |
|
"eval_samples_per_second": 2.901, |
|
"eval_steps_per_second": 0.725, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.236111111111111, |
|
"grad_norm": 6.09140682220459, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6347, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 4.305555555555555, |
|
"grad_norm": 5.452902317047119, |
|
"learning_rate": 1e-06, |
|
"loss": 1.7689, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 3.5834009647369385, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6514, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 3.288220167160034, |
|
"learning_rate": 1e-06, |
|
"loss": 1.4941, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.513888888888889, |
|
"grad_norm": 4.202756404876709, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5374, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 4.583333333333333, |
|
"grad_norm": 3.9757556915283203, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6289, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.652777777777778, |
|
"grad_norm": 3.3575947284698486, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5446, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 4.722222222222222, |
|
"grad_norm": 4.207667350769043, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5668, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.791666666666667, |
|
"grad_norm": 3.2263221740722656, |
|
"learning_rate": 1e-06, |
|
"loss": 1.4529, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 4.861111111111111, |
|
"grad_norm": 3.272395610809326, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5215, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.930555555555555, |
|
"grad_norm": 3.4315106868743896, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5781, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 3.9581406116485596, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5001, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 360, |
|
"total_flos": 2.3797808143060173e+17, |
|
"train_loss": 1.9143991947174073, |
|
"train_runtime": 6464.4185, |
|
"train_samples_per_second": 0.891, |
|
"train_steps_per_second": 0.056 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 360, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 100, |
|
"total_flos": 2.3797808143060173e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|