{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.584507042253521,
  "eval_steps": 500,
  "global_step": 1800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0880281690140845,
      "grad_norm": 1221.827392578125,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 297.1973,
      "step": 100
    },
    {
      "epoch": 0.176056338028169,
      "grad_norm": 1063.7159423828125,
      "learning_rate": 4.998119881260576e-06,
      "loss": 161.9498,
      "step": 200
    },
    {
      "epoch": 0.2640845070422535,
      "grad_norm": 1415.1790771484375,
      "learning_rate": 4.93261217644956e-06,
      "loss": 149.2985,
      "step": 300
    },
    {
      "epoch": 0.352112676056338,
      "grad_norm": 1143.650634765625,
      "learning_rate": 4.775907352415367e-06,
      "loss": 146.5141,
      "step": 400
    },
    {
      "epoch": 0.44014084507042256,
      "grad_norm": 1112.0836181640625,
      "learning_rate": 4.533880175657419e-06,
      "loss": 125.0505,
      "step": 500
    },
    {
      "epoch": 0.528169014084507,
      "grad_norm": 2647.099365234375,
      "learning_rate": 4.215604094671835e-06,
      "loss": 136.0549,
      "step": 600
    },
    {
      "epoch": 0.6161971830985915,
      "grad_norm": 914.672119140625,
      "learning_rate": 3.833011082004229e-06,
      "loss": 152.9934,
      "step": 700
    },
    {
      "epoch": 0.704225352112676,
      "grad_norm": 941.3635864257812,
      "learning_rate": 3.400444312011776e-06,
      "loss": 142.6481,
      "step": 800
    },
    {
      "epoch": 0.7922535211267606,
      "grad_norm": 2750.364990234375,
      "learning_rate": 2.9341204441673267e-06,
      "loss": 180.6238,
      "step": 900
    },
    {
      "epoch": 0.8802816901408451,
      "grad_norm": 1102.6456298828125,
      "learning_rate": 2.4515216705704396e-06,
      "loss": 160.7811,
      "step": 1000
    },
    {
      "epoch": 0.9683098591549296,
      "grad_norm": 2127.451416015625,
      "learning_rate": 1.970740319426474e-06,
      "loss": 136.9254,
      "step": 1100
    },
    {
      "epoch": 1.0,
      "eval_loss": 415.93927001953125,
      "eval_runtime": 8.7817,
      "eval_samples_per_second": 115.012,
      "eval_steps_per_second": 14.462,
      "step": 1136
    },
    {
      "epoch": 1.056338028169014,
      "grad_norm": 2116.73193359375,
      "learning_rate": 1.509800584902108e-06,
      "loss": 142.92,
      "step": 1200
    },
    {
      "epoch": 1.1443661971830985,
      "grad_norm": 1667.754638671875,
      "learning_rate": 1.085982811283654e-06,
      "loss": 141.8554,
      "step": 1300
    },
    {
      "epoch": 1.232394366197183,
      "grad_norm": 1366.0006103515625,
      "learning_rate": 7.151756636052529e-07,
      "loss": 149.8122,
      "step": 1400
    },
    {
      "epoch": 1.3204225352112675,
      "grad_norm": 1805.1434326171875,
      "learning_rate": 4.1128047146765936e-07,
      "loss": 148.2215,
      "step": 1500
    },
    {
      "epoch": 1.408450704225352,
      "grad_norm": 1603.68212890625,
      "learning_rate": 1.8569007682777417e-07,
      "loss": 142.1948,
      "step": 1600
    },
    {
      "epoch": 1.4964788732394365,
      "grad_norm": 1250.8704833984375,
      "learning_rate": 4.6861723431538273e-08,
      "loss": 123.6116,
      "step": 1700
    },
    {
      "epoch": 1.584507042253521,
      "grad_norm": 3193.298828125,
      "learning_rate": 0.0,
      "loss": 142.5536,
      "step": 1800
    }
  ],
  "logging_steps": 100,
  "max_steps": 1800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 600,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}