|
{
  "best_metric": 80.63612438300382,
  "best_model_checkpoint": "/root/turkic_qa/tr_uzn_models/tr_uzn_xlm_roberta_base_squad_model/checkpoint-8120",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 8120,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "step": 812,
      "train_exact_match": 56.743256743256744,
      "train_f1": 75.16234224627293,
      "train_runtime": 18.93,
      "train_samples_per_second": 87.692,
      "train_steps_per_second": 3.17
    },
    {
      "epoch": 1.0,
      "grad_norm": 33.5258674621582,
      "learning_rate": 5e-06,
      "loss": 1.3302,
      "step": 812
    },
    {
      "epoch": 1.0,
      "eval_exact_match": 56.78125,
      "eval_f1": 74.92855180045365,
      "eval_runtime": 58.0842,
      "eval_samples_per_second": 87.993,
      "eval_steps_per_second": 3.151,
      "step": 812
    },
    {
      "epoch": 2.0,
      "step": 1624,
      "train_exact_match": 66.53346653346654,
      "train_f1": 81.01468766554503,
      "train_runtime": 19.1594,
      "train_samples_per_second": 84.919,
      "train_steps_per_second": 3.079
    },
    {
      "epoch": 2.0,
      "grad_norm": 39.375850677490234,
      "learning_rate": 1e-05,
      "loss": 0.9867,
      "step": 1624
    },
    {
      "epoch": 2.0,
      "eval_exact_match": 60.71875,
      "eval_f1": 77.48042477059983,
      "eval_runtime": 59.4097,
      "eval_samples_per_second": 86.03,
      "eval_steps_per_second": 3.08,
      "step": 1624
    },
    {
      "epoch": 3.0,
      "step": 2436,
      "train_exact_match": 73.32667332667333,
      "train_f1": 86.12936978278546,
      "train_runtime": 18.141,
      "train_samples_per_second": 87.04,
      "train_steps_per_second": 3.142
    },
    {
      "epoch": 3.0,
      "grad_norm": 43.98362350463867,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.7954,
      "step": 2436
    },
    {
      "epoch": 3.0,
      "eval_exact_match": 63.53125,
      "eval_f1": 79.36767777349903,
      "eval_runtime": 58.582,
      "eval_samples_per_second": 87.245,
      "eval_steps_per_second": 3.124,
      "step": 2436
    },
    {
      "epoch": 4.0,
      "step": 3248,
      "train_exact_match": 74.72527472527473,
      "train_f1": 88.40189950318806,
      "train_runtime": 18.0666,
      "train_samples_per_second": 88.174,
      "train_steps_per_second": 3.155
    },
    {
      "epoch": 4.0,
      "grad_norm": 28.4106502532959,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.6427,
      "step": 3248
    },
    {
      "epoch": 4.0,
      "eval_exact_match": 65.21875,
      "eval_f1": 80.19939907948,
      "eval_runtime": 57.9356,
      "eval_samples_per_second": 88.219,
      "eval_steps_per_second": 3.159,
      "step": 3248
    },
    {
      "epoch": 5.0,
      "step": 4060,
      "train_exact_match": 79.72027972027972,
      "train_f1": 90.39293523910672,
      "train_runtime": 18.0742,
      "train_samples_per_second": 88.303,
      "train_steps_per_second": 3.154
    },
    {
      "epoch": 5.0,
      "grad_norm": 58.56404495239258,
      "learning_rate": 6.25e-06,
      "loss": 0.5287,
      "step": 4060
    },
    {
      "epoch": 5.0,
      "eval_exact_match": 64.53125,
      "eval_f1": 79.76982128782555,
      "eval_runtime": 57.84,
      "eval_samples_per_second": 88.365,
      "eval_steps_per_second": 3.164,
      "step": 4060
    },
    {
      "epoch": 6.0,
      "step": 4872,
      "train_exact_match": 79.42057942057941,
      "train_f1": 91.11407011087398,
      "train_runtime": 18.4811,
      "train_samples_per_second": 88.252,
      "train_steps_per_second": 3.192
    },
    {
      "epoch": 6.0,
      "grad_norm": 45.58940124511719,
      "learning_rate": 5e-06,
      "loss": 0.4506,
      "step": 4872
    },
    {
      "epoch": 6.0,
      "eval_exact_match": 65.46875,
      "eval_f1": 80.18984733785271,
      "eval_runtime": 58.0085,
      "eval_samples_per_second": 88.108,
      "eval_steps_per_second": 3.155,
      "step": 4872
    },
    {
      "epoch": 7.0,
      "step": 5684,
      "train_exact_match": 80.41958041958041,
      "train_f1": 91.05656126706158,
      "train_runtime": 17.8755,
      "train_samples_per_second": 89.172,
      "train_steps_per_second": 3.189
    },
    {
      "epoch": 7.0,
      "grad_norm": 26.53660774230957,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.3805,
      "step": 5684
    },
    {
      "epoch": 7.0,
      "eval_exact_match": 65.125,
      "eval_f1": 80.04186683186089,
      "eval_runtime": 57.4933,
      "eval_samples_per_second": 88.897,
      "eval_steps_per_second": 3.183,
      "step": 5684
    },
    {
      "epoch": 8.0,
      "step": 6496,
      "train_exact_match": 86.31368631368632,
      "train_f1": 93.75812824286764,
      "train_runtime": 17.4124,
      "train_samples_per_second": 88.328,
      "train_steps_per_second": 3.159
    },
    {
      "epoch": 8.0,
      "grad_norm": 34.22230911254883,
      "learning_rate": 2.5e-06,
      "loss": 0.3381,
      "step": 6496
    },
    {
      "epoch": 8.0,
      "eval_exact_match": 65.90625,
      "eval_f1": 80.14333737032665,
      "eval_runtime": 57.8402,
      "eval_samples_per_second": 88.364,
      "eval_steps_per_second": 3.164,
      "step": 6496
    },
    {
      "epoch": 9.0,
      "step": 7308,
      "train_exact_match": 86.01398601398601,
      "train_f1": 94.05045602201064,
      "train_runtime": 18.8091,
      "train_samples_per_second": 86.767,
      "train_steps_per_second": 3.137
    },
    {
      "epoch": 9.0,
      "grad_norm": 87.17084503173828,
      "learning_rate": 1.25e-06,
      "loss": 0.3032,
      "step": 7308
    },
    {
      "epoch": 9.0,
      "eval_exact_match": 66.21875,
      "eval_f1": 80.52702562432783,
      "eval_runtime": 58.1783,
      "eval_samples_per_second": 87.851,
      "eval_steps_per_second": 3.146,
      "step": 7308
    },
    {
      "epoch": 10.0,
      "step": 8120,
      "train_exact_match": 86.41358641358642,
      "train_f1": 94.52312811018126,
      "train_runtime": 17.2785,
      "train_samples_per_second": 87.276,
      "train_steps_per_second": 3.125
    },
    {
      "epoch": 10.0,
      "grad_norm": 31.117448806762695,
      "learning_rate": 0.0,
      "loss": 0.2815,
      "step": 8120
    },
    {
      "epoch": 10.0,
      "eval_exact_match": 66.4375,
      "eval_f1": 80.63612438300382,
      "eval_runtime": 58.3935,
      "eval_samples_per_second": 87.527,
      "eval_steps_per_second": 3.134,
      "step": 8120
    },
    {
      "epoch": 10.0,
      "step": 8120,
      "total_flos": 4.451320899376128e+16,
      "train_loss": 0.6037781062384544,
      "train_runtime": 5220.6695,
      "train_samples_per_second": 43.508,
      "train_steps_per_second": 1.555
    }
  ],
  "logging_steps": 500,
  "max_steps": 8120,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 4.451320899376128e+16,
  "train_batch_size": 28,
  "trial_name": null,
  "trial_params": null
}