{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9936,
  "eval_steps": 500,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.192,
      "grad_norm": 0.3010159432888031,
      "learning_rate": 3.191489361702128e-05,
      "loss": 2.2514,
      "step": 15
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.2521279752254486,
      "learning_rate": 6.382978723404256e-05,
      "loss": 2.1582,
      "step": 30
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.2557184100151062,
      "learning_rate": 9.574468085106384e-05,
      "loss": 1.8683,
      "step": 45
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.22737619280815125,
      "learning_rate": 0.00012765957446808513,
      "loss": 1.6607,
      "step": 60
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.19700799882411957,
      "learning_rate": 0.00015957446808510637,
      "loss": 1.4682,
      "step": 75
    },
    {
      "epoch": 1.152,
      "grad_norm": 0.2368774265050888,
      "learning_rate": 0.00019148936170212768,
      "loss": 1.3196,
      "step": 90
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 0.3574005365371704,
      "learning_rate": 0.00018990825688073394,
      "loss": 1.1283,
      "step": 105
    },
    {
      "epoch": 1.536,
      "grad_norm": 0.3291820287704468,
      "learning_rate": 0.0001761467889908257,
      "loss": 0.9629,
      "step": 120
    },
    {
      "epoch": 1.728,
      "grad_norm": 0.4235256612300873,
      "learning_rate": 0.00016238532110091745,
      "loss": 0.8761,
      "step": 135
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.3522832691669464,
      "learning_rate": 0.00014862385321100919,
      "loss": 0.7764,
      "step": 150
    },
    {
      "epoch": 2.112,
      "grad_norm": 0.3914245367050171,
      "learning_rate": 0.00013486238532110092,
      "loss": 0.6717,
      "step": 165
    },
    {
      "epoch": 2.304,
      "grad_norm": 0.3502177894115448,
      "learning_rate": 0.00012110091743119268,
      "loss": 0.6329,
      "step": 180
    },
    {
      "epoch": 2.496,
      "grad_norm": 0.5064918398857117,
      "learning_rate": 0.0001073394495412844,
      "loss": 0.5551,
      "step": 195
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 0.4178985357284546,
      "learning_rate": 9.357798165137616e-05,
      "loss": 0.5132,
      "step": 210
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.3729366958141327,
      "learning_rate": 7.98165137614679e-05,
      "loss": 0.4831,
      "step": 225
    },
    {
      "epoch": 3.072,
      "grad_norm": 0.30623266100883484,
      "learning_rate": 6.605504587155963e-05,
      "loss": 0.4484,
      "step": 240
    },
    {
      "epoch": 3.2640000000000002,
      "grad_norm": 0.31436726450920105,
      "learning_rate": 5.229357798165138e-05,
      "loss": 0.4255,
      "step": 255
    },
    {
      "epoch": 3.456,
      "grad_norm": 0.32084089517593384,
      "learning_rate": 3.8532110091743125e-05,
      "loss": 0.3731,
      "step": 270
    },
    {
      "epoch": 3.648,
      "grad_norm": 0.3452220857143402,
      "learning_rate": 2.4770642201834864e-05,
      "loss": 0.3786,
      "step": 285
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.3483213782310486,
      "learning_rate": 1.1009174311926607e-05,
      "loss": 0.3988,
      "step": 300
    }
  ],
  "logging_steps": 15,
  "max_steps": 312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.447199037200794e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}