{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1000.0,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 62.5,
      "grad_norm": 0.2072206288576126,
      "learning_rate": 0.0009375,
      "loss": 0.3349,
      "step": 500
    },
    {
      "epoch": 62.5,
      "eval_loss": 0.21073544025421143,
      "eval_runtime": 1.7576,
      "eval_samples_per_second": 4470.361,
      "eval_steps_per_second": 35.276,
      "step": 500
    },
    {
      "epoch": 125.0,
      "grad_norm": 0.20736312866210938,
      "learning_rate": 0.000875,
      "loss": 0.1147,
      "step": 1000
    },
    {
      "epoch": 125.0,
      "eval_loss": 0.2409573197364807,
      "eval_runtime": 1.7582,
      "eval_samples_per_second": 4468.854,
      "eval_steps_per_second": 35.264,
      "step": 1000
    },
    {
      "epoch": 187.5,
      "grad_norm": 0.17418691515922546,
      "learning_rate": 0.0008125000000000001,
      "loss": 0.071,
      "step": 1500
    },
    {
      "epoch": 187.5,
      "eval_loss": 0.2686581015586853,
      "eval_runtime": 1.7698,
      "eval_samples_per_second": 4439.56,
      "eval_steps_per_second": 35.033,
      "step": 1500
    },
    {
      "epoch": 250.0,
      "grad_norm": 0.1891753375530243,
      "learning_rate": 0.00075,
      "loss": 0.0502,
      "step": 2000
    },
    {
      "epoch": 250.0,
      "eval_loss": 0.29014259576797485,
      "eval_runtime": 1.7819,
      "eval_samples_per_second": 4409.274,
      "eval_steps_per_second": 34.794,
      "step": 2000
    },
    {
      "epoch": 312.5,
      "grad_norm": 0.16434858739376068,
      "learning_rate": 0.0006875,
      "loss": 0.0373,
      "step": 2500
    },
    {
      "epoch": 312.5,
      "eval_loss": 0.3033287525177002,
      "eval_runtime": 1.7585,
      "eval_samples_per_second": 4468.05,
      "eval_steps_per_second": 35.258,
      "step": 2500
    },
    {
      "epoch": 375.0,
      "grad_norm": 0.11065812408924103,
      "learning_rate": 0.000625,
      "loss": 0.0301,
      "step": 3000
    },
    {
      "epoch": 375.0,
      "eval_loss": 0.31410983204841614,
      "eval_runtime": 1.7765,
      "eval_samples_per_second": 4422.629,
      "eval_steps_per_second": 34.899,
      "step": 3000
    },
    {
      "epoch": 437.5,
      "grad_norm": 0.12541820108890533,
      "learning_rate": 0.0005625000000000001,
      "loss": 0.025,
      "step": 3500
    },
    {
      "epoch": 437.5,
      "eval_loss": 0.3234836161136627,
      "eval_runtime": 1.8714,
      "eval_samples_per_second": 4198.44,
      "eval_steps_per_second": 33.13,
      "step": 3500
    },
    {
      "epoch": 500.0,
      "grad_norm": 0.12353639304637909,
      "learning_rate": 0.0005,
      "loss": 0.0212,
      "step": 4000
    },
    {
      "epoch": 500.0,
      "eval_loss": 0.33116382360458374,
      "eval_runtime": 2.0737,
      "eval_samples_per_second": 3788.95,
      "eval_steps_per_second": 29.899,
      "step": 4000
    },
    {
      "epoch": 562.5,
      "grad_norm": 0.14147008955478668,
      "learning_rate": 0.0004375,
      "loss": 0.0187,
      "step": 4500
    },
    {
      "epoch": 562.5,
      "eval_loss": 0.3403891623020172,
      "eval_runtime": 1.7439,
      "eval_samples_per_second": 4505.522,
      "eval_steps_per_second": 35.553,
      "step": 4500
    },
    {
      "epoch": 625.0,
      "grad_norm": 0.17466695606708527,
      "learning_rate": 0.000375,
      "loss": 0.017,
      "step": 5000
    },
    {
      "epoch": 625.0,
      "eval_loss": 0.3371485471725464,
      "eval_runtime": 1.7727,
      "eval_samples_per_second": 4432.252,
      "eval_steps_per_second": 34.975,
      "step": 5000
    },
    {
      "epoch": 687.5,
      "grad_norm": 0.1278233528137207,
      "learning_rate": 0.0003125,
      "loss": 0.0148,
      "step": 5500
    },
    {
      "epoch": 687.5,
      "eval_loss": 0.3466494679450989,
      "eval_runtime": 1.7776,
      "eval_samples_per_second": 4420.019,
      "eval_steps_per_second": 34.879,
      "step": 5500
    },
    {
      "epoch": 750.0,
      "grad_norm": 0.08440423011779785,
      "learning_rate": 0.00025,
      "loss": 0.0139,
      "step": 6000
    },
    {
      "epoch": 750.0,
      "eval_loss": 0.34804055094718933,
      "eval_runtime": 1.7744,
      "eval_samples_per_second": 4427.887,
      "eval_steps_per_second": 34.941,
      "step": 6000
    },
    {
      "epoch": 812.5,
      "grad_norm": 0.1064932644367218,
      "learning_rate": 0.0001875,
      "loss": 0.0124,
      "step": 6500
    },
    {
      "epoch": 812.5,
      "eval_loss": 0.3551761209964752,
      "eval_runtime": 1.7547,
      "eval_samples_per_second": 4477.816,
      "eval_steps_per_second": 35.335,
      "step": 6500
    },
    {
      "epoch": 875.0,
      "grad_norm": 0.07584909349679947,
      "learning_rate": 0.000125,
      "loss": 0.0118,
      "step": 7000
    },
    {
      "epoch": 875.0,
      "eval_loss": 0.3594026267528534,
      "eval_runtime": 1.7559,
      "eval_samples_per_second": 4474.601,
      "eval_steps_per_second": 35.309,
      "step": 7000
    },
    {
      "epoch": 937.5,
      "grad_norm": 0.04748733341693878,
      "learning_rate": 6.25e-05,
      "loss": 0.0112,
      "step": 7500
    },
    {
      "epoch": 937.5,
      "eval_loss": 0.35810717940330505,
      "eval_runtime": 1.7577,
      "eval_samples_per_second": 4469.966,
      "eval_steps_per_second": 35.273,
      "step": 7500
    },
    {
      "epoch": 1000.0,
      "grad_norm": 0.0842234194278717,
      "learning_rate": 0.0,
      "loss": 0.0106,
      "step": 8000
    },
    {
      "epoch": 1000.0,
      "eval_loss": 0.35844165086746216,
      "eval_runtime": 1.7498,
      "eval_samples_per_second": 4490.355,
      "eval_steps_per_second": 35.434,
      "step": 8000
    },
    {
      "epoch": 1000.0,
      "step": 8000,
      "total_flos": 3.308878023570227e+16,
      "train_loss": 0.04967061561346054,
      "train_runtime": 721.4078,
      "train_samples_per_second": 1386.178,
      "train_steps_per_second": 11.089
    }
  ],
  "logging_steps": 500,
  "max_steps": 8000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1000,
  "save_steps": 500,
  "total_flos": 3.308878023570227e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}