longt5_xl_sfd_4096_e10 / trainer_state.json
learn3r's picture
End of training
114d833 verified
{
"best_metric": 2.3254711627960205,
"best_model_checkpoint": "/home/co-ou1/rds/hpc-work/models/longt5_xl_sfd_4096/longt5_xl_sfd_4096_e10/checkpoint-28",
"epoch": 9.73913043478261,
"eval_steps": 500,
"global_step": 140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14,
"learning_rate": 0.001,
"loss": 3.2585,
"step": 2
},
{
"epoch": 0.28,
"learning_rate": 0.001,
"loss": 3.3242,
"step": 4
},
{
"epoch": 0.42,
"learning_rate": 0.001,
"loss": 3.6288,
"step": 6
},
{
"epoch": 0.56,
"learning_rate": 0.001,
"loss": 3.1948,
"step": 8
},
{
"epoch": 0.7,
"learning_rate": 0.001,
"loss": 2.9524,
"step": 10
},
{
"epoch": 0.83,
"learning_rate": 0.001,
"loss": 2.8889,
"step": 12
},
{
"epoch": 0.97,
"learning_rate": 0.001,
"loss": 3.0332,
"step": 14
},
{
"epoch": 0.97,
"eval_loss": 2.5423872470855713,
"eval_runtime": 96.2671,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 0.447,
"step": 14
},
{
"epoch": 1.11,
"learning_rate": 0.001,
"loss": 2.6074,
"step": 16
},
{
"epoch": 1.25,
"learning_rate": 0.001,
"loss": 2.6808,
"step": 18
},
{
"epoch": 1.39,
"learning_rate": 0.001,
"loss": 2.6085,
"step": 20
},
{
"epoch": 1.53,
"learning_rate": 0.001,
"loss": 2.5063,
"step": 22
},
{
"epoch": 1.67,
"learning_rate": 0.001,
"loss": 2.4614,
"step": 24
},
{
"epoch": 1.81,
"learning_rate": 0.001,
"loss": 2.4519,
"step": 26
},
{
"epoch": 1.95,
"learning_rate": 0.001,
"loss": 2.4105,
"step": 28
},
{
"epoch": 1.95,
"eval_loss": 2.3254711627960205,
"eval_runtime": 99.9342,
"eval_samples_per_second": 3.382,
"eval_steps_per_second": 0.43,
"step": 28
},
{
"epoch": 2.09,
"learning_rate": 0.001,
"loss": 2.1628,
"step": 30
},
{
"epoch": 2.23,
"learning_rate": 0.001,
"loss": 2.0701,
"step": 32
},
{
"epoch": 2.37,
"learning_rate": 0.001,
"loss": 2.0992,
"step": 34
},
{
"epoch": 2.5,
"learning_rate": 0.001,
"loss": 2.0401,
"step": 36
},
{
"epoch": 2.64,
"learning_rate": 0.001,
"loss": 2.0299,
"step": 38
},
{
"epoch": 2.78,
"learning_rate": 0.001,
"loss": 2.0812,
"step": 40
},
{
"epoch": 2.92,
"learning_rate": 0.001,
"loss": 2.0496,
"step": 42
},
{
"epoch": 2.99,
"eval_loss": 2.3419768810272217,
"eval_runtime": 99.9699,
"eval_samples_per_second": 3.381,
"eval_steps_per_second": 0.43,
"step": 43
},
{
"epoch": 3.06,
"learning_rate": 0.001,
"loss": 1.9994,
"step": 44
},
{
"epoch": 3.2,
"learning_rate": 0.001,
"loss": 1.7276,
"step": 46
},
{
"epoch": 3.34,
"learning_rate": 0.001,
"loss": 1.7639,
"step": 48
},
{
"epoch": 3.48,
"learning_rate": 0.001,
"loss": 1.7624,
"step": 50
},
{
"epoch": 3.62,
"learning_rate": 0.001,
"loss": 1.7726,
"step": 52
},
{
"epoch": 3.76,
"learning_rate": 0.001,
"loss": 1.7218,
"step": 54
},
{
"epoch": 3.9,
"learning_rate": 0.001,
"loss": 1.7473,
"step": 56
},
{
"epoch": 3.97,
"eval_loss": 2.352036476135254,
"eval_runtime": 109.1375,
"eval_samples_per_second": 3.097,
"eval_steps_per_second": 0.394,
"step": 57
},
{
"epoch": 4.03,
"learning_rate": 0.001,
"loss": 1.6586,
"step": 58
},
{
"epoch": 4.17,
"learning_rate": 0.001,
"loss": 1.3888,
"step": 60
},
{
"epoch": 4.31,
"learning_rate": 0.001,
"loss": 1.4192,
"step": 62
},
{
"epoch": 4.45,
"learning_rate": 0.001,
"loss": 1.4003,
"step": 64
},
{
"epoch": 4.59,
"learning_rate": 0.001,
"loss": 1.4405,
"step": 66
},
{
"epoch": 4.73,
"learning_rate": 0.001,
"loss": 1.3766,
"step": 68
},
{
"epoch": 4.87,
"learning_rate": 0.001,
"loss": 1.4007,
"step": 70
},
{
"epoch": 4.94,
"eval_loss": 2.4979982376098633,
"eval_runtime": 101.842,
"eval_samples_per_second": 3.319,
"eval_steps_per_second": 0.422,
"step": 71
},
{
"epoch": 5.01,
"learning_rate": 0.001,
"loss": 1.3547,
"step": 72
},
{
"epoch": 5.15,
"learning_rate": 0.001,
"loss": 1.3243,
"step": 74
},
{
"epoch": 5.29,
"learning_rate": 0.001,
"loss": 1.3494,
"step": 76
},
{
"epoch": 5.43,
"learning_rate": 0.001,
"loss": 1.3982,
"step": 78
},
{
"epoch": 5.57,
"learning_rate": 0.001,
"loss": 1.3294,
"step": 80
},
{
"epoch": 5.7,
"learning_rate": 0.001,
"loss": 1.404,
"step": 82
},
{
"epoch": 5.84,
"learning_rate": 0.001,
"loss": 1.371,
"step": 84
},
{
"epoch": 5.98,
"learning_rate": 0.001,
"loss": 1.3809,
"step": 86
},
{
"epoch": 5.98,
"eval_loss": 2.4785053730010986,
"eval_runtime": 78.2069,
"eval_samples_per_second": 4.322,
"eval_steps_per_second": 0.55,
"step": 86
},
{
"epoch": 6.12,
"learning_rate": 0.001,
"loss": 1.0798,
"step": 88
},
{
"epoch": 6.26,
"learning_rate": 0.001,
"loss": 1.0476,
"step": 90
},
{
"epoch": 6.4,
"learning_rate": 0.001,
"loss": 1.111,
"step": 92
},
{
"epoch": 6.54,
"learning_rate": 0.001,
"loss": 1.0734,
"step": 94
},
{
"epoch": 6.68,
"learning_rate": 0.001,
"loss": 1.0563,
"step": 96
},
{
"epoch": 6.82,
"learning_rate": 0.001,
"loss": 1.1215,
"step": 98
},
{
"epoch": 6.96,
"learning_rate": 0.001,
"loss": 1.1153,
"step": 100
},
{
"epoch": 6.96,
"eval_loss": 2.732642650604248,
"eval_runtime": 78.0611,
"eval_samples_per_second": 4.33,
"eval_steps_per_second": 0.551,
"step": 100
},
{
"epoch": 7.1,
"learning_rate": 0.001,
"loss": 0.9032,
"step": 102
},
{
"epoch": 7.23,
"learning_rate": 0.001,
"loss": 0.8517,
"step": 104
},
{
"epoch": 7.37,
"learning_rate": 0.001,
"loss": 0.8711,
"step": 106
},
{
"epoch": 7.51,
"learning_rate": 0.001,
"loss": 0.8849,
"step": 108
},
{
"epoch": 7.65,
"learning_rate": 0.001,
"loss": 0.898,
"step": 110
},
{
"epoch": 7.79,
"learning_rate": 0.001,
"loss": 0.9153,
"step": 112
},
{
"epoch": 7.93,
"learning_rate": 0.001,
"loss": 0.9129,
"step": 114
},
{
"epoch": 8.0,
"eval_loss": 2.9232447147369385,
"eval_runtime": 78.4249,
"eval_samples_per_second": 4.31,
"eval_steps_per_second": 0.548,
"step": 115
},
{
"epoch": 8.07,
"learning_rate": 0.001,
"loss": 0.7377,
"step": 116
},
{
"epoch": 8.21,
"learning_rate": 0.001,
"loss": 0.6558,
"step": 118
},
{
"epoch": 8.35,
"learning_rate": 0.001,
"loss": 0.7047,
"step": 120
},
{
"epoch": 8.49,
"learning_rate": 0.001,
"loss": 0.7382,
"step": 122
},
{
"epoch": 8.63,
"learning_rate": 0.001,
"loss": 0.6919,
"step": 124
},
{
"epoch": 8.77,
"learning_rate": 0.001,
"loss": 0.7257,
"step": 126
},
{
"epoch": 8.9,
"learning_rate": 0.001,
"loss": 0.7118,
"step": 128
},
{
"epoch": 8.97,
"eval_loss": 3.047579526901245,
"eval_runtime": 78.168,
"eval_samples_per_second": 4.324,
"eval_steps_per_second": 0.55,
"step": 129
},
{
"epoch": 9.04,
"learning_rate": 0.001,
"loss": 0.6401,
"step": 130
},
{
"epoch": 9.18,
"learning_rate": 0.001,
"loss": 0.5032,
"step": 132
},
{
"epoch": 9.32,
"learning_rate": 0.001,
"loss": 0.548,
"step": 134
},
{
"epoch": 9.46,
"learning_rate": 0.001,
"loss": 0.5218,
"step": 136
},
{
"epoch": 9.6,
"learning_rate": 0.001,
"loss": 0.5744,
"step": 138
},
{
"epoch": 9.74,
"learning_rate": 0.001,
"loss": 0.5883,
"step": 140
},
{
"epoch": 9.74,
"eval_loss": 3.3644142150878906,
"eval_runtime": 78.3345,
"eval_samples_per_second": 4.315,
"eval_steps_per_second": 0.549,
"step": 140
},
{
"epoch": 9.74,
"step": 140,
"total_flos": 2.447850236380324e+18,
"train_loss": 1.5745136559009552,
"train_runtime": 36489.2213,
"train_samples_per_second": 1.007,
"train_steps_per_second": 0.004
}
],
"logging_steps": 2,
"max_steps": 140,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 2.447850236380324e+18,
"trial_name": null,
"trial_params": null
}