longt5_xl_sfd_bp_10 / trainer_state.json
learn3r's picture
End of training
3f12c27 verified
raw
history blame contribute delete
No virus
19.4 kB
{
"best_metric": 1.5011385679244995,
"best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_sfd_bp_10/checkpoint-43",
"epoch": 19.47826086956522,
"eval_steps": 500,
"global_step": 280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14,
"learning_rate": 0.001,
"loss": 2.9672,
"step": 2
},
{
"epoch": 0.28,
"learning_rate": 0.001,
"loss": 3.0162,
"step": 4
},
{
"epoch": 0.42,
"learning_rate": 0.001,
"loss": 3.1689,
"step": 6
},
{
"epoch": 0.56,
"learning_rate": 0.001,
"loss": 2.902,
"step": 8
},
{
"epoch": 0.7,
"learning_rate": 0.001,
"loss": 2.4891,
"step": 10
},
{
"epoch": 0.83,
"learning_rate": 0.001,
"loss": 2.8498,
"step": 12
},
{
"epoch": 0.97,
"learning_rate": 0.001,
"loss": 2.3973,
"step": 14
},
{
"epoch": 0.97,
"eval_loss": 1.9026896953582764,
"eval_runtime": 84.3252,
"eval_samples_per_second": 4.008,
"eval_steps_per_second": 0.51,
"step": 14
},
{
"epoch": 1.11,
"learning_rate": 0.001,
"loss": 2.1921,
"step": 16
},
{
"epoch": 1.25,
"learning_rate": 0.001,
"loss": 2.0091,
"step": 18
},
{
"epoch": 1.39,
"learning_rate": 0.001,
"loss": 1.8884,
"step": 20
},
{
"epoch": 1.53,
"learning_rate": 0.001,
"loss": 1.7955,
"step": 22
},
{
"epoch": 1.67,
"learning_rate": 0.001,
"loss": 1.7023,
"step": 24
},
{
"epoch": 1.81,
"learning_rate": 0.001,
"loss": 1.8178,
"step": 26
},
{
"epoch": 1.95,
"learning_rate": 0.001,
"loss": 1.9188,
"step": 28
},
{
"epoch": 1.95,
"eval_loss": 1.694077968597412,
"eval_runtime": 84.303,
"eval_samples_per_second": 4.009,
"eval_steps_per_second": 0.51,
"step": 28
},
{
"epoch": 2.09,
"learning_rate": 0.001,
"loss": 1.6461,
"step": 30
},
{
"epoch": 2.23,
"learning_rate": 0.001,
"loss": 1.552,
"step": 32
},
{
"epoch": 2.37,
"learning_rate": 0.001,
"loss": 1.4914,
"step": 34
},
{
"epoch": 2.5,
"learning_rate": 0.001,
"loss": 1.457,
"step": 36
},
{
"epoch": 2.64,
"learning_rate": 0.001,
"loss": 1.4499,
"step": 38
},
{
"epoch": 2.78,
"learning_rate": 0.001,
"loss": 1.4868,
"step": 40
},
{
"epoch": 2.92,
"learning_rate": 0.001,
"loss": 1.4297,
"step": 42
},
{
"epoch": 2.99,
"eval_loss": 1.5011385679244995,
"eval_runtime": 84.2558,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.51,
"step": 43
},
{
"epoch": 3.06,
"learning_rate": 0.001,
"loss": 1.3757,
"step": 44
},
{
"epoch": 3.2,
"learning_rate": 0.001,
"loss": 1.2701,
"step": 46
},
{
"epoch": 3.34,
"learning_rate": 0.001,
"loss": 1.2826,
"step": 48
},
{
"epoch": 3.48,
"learning_rate": 0.001,
"loss": 1.2945,
"step": 50
},
{
"epoch": 3.62,
"learning_rate": 0.001,
"loss": 1.2963,
"step": 52
},
{
"epoch": 3.76,
"learning_rate": 0.001,
"loss": 1.2933,
"step": 54
},
{
"epoch": 3.9,
"learning_rate": 0.001,
"loss": 1.2759,
"step": 56
},
{
"epoch": 3.97,
"eval_loss": 1.5048083066940308,
"eval_runtime": 84.2663,
"eval_samples_per_second": 4.011,
"eval_steps_per_second": 0.51,
"step": 57
},
{
"epoch": 4.03,
"learning_rate": 0.001,
"loss": 1.2446,
"step": 58
},
{
"epoch": 4.17,
"learning_rate": 0.001,
"loss": 1.0992,
"step": 60
},
{
"epoch": 4.31,
"learning_rate": 0.001,
"loss": 1.0771,
"step": 62
},
{
"epoch": 4.45,
"learning_rate": 0.001,
"loss": 1.1254,
"step": 64
},
{
"epoch": 4.59,
"learning_rate": 0.001,
"loss": 1.1644,
"step": 66
},
{
"epoch": 4.73,
"learning_rate": 0.001,
"loss": 1.1485,
"step": 68
},
{
"epoch": 4.87,
"learning_rate": 0.001,
"loss": 1.1421,
"step": 70
},
{
"epoch": 4.94,
"eval_loss": 1.5463248491287231,
"eval_runtime": 84.2507,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.51,
"step": 71
},
{
"epoch": 5.01,
"learning_rate": 0.001,
"loss": 1.1233,
"step": 72
},
{
"epoch": 5.15,
"learning_rate": 0.001,
"loss": 0.8919,
"step": 74
},
{
"epoch": 5.29,
"learning_rate": 0.001,
"loss": 0.9349,
"step": 76
},
{
"epoch": 5.43,
"learning_rate": 0.001,
"loss": 0.9363,
"step": 78
},
{
"epoch": 5.57,
"learning_rate": 0.001,
"loss": 0.9203,
"step": 80
},
{
"epoch": 5.7,
"learning_rate": 0.001,
"loss": 0.9429,
"step": 82
},
{
"epoch": 5.84,
"learning_rate": 0.001,
"loss": 0.9495,
"step": 84
},
{
"epoch": 5.98,
"learning_rate": 0.001,
"loss": 0.9605,
"step": 86
},
{
"epoch": 5.98,
"eval_loss": 1.6269720792770386,
"eval_runtime": 84.2452,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.51,
"step": 86
},
{
"epoch": 6.12,
"learning_rate": 0.001,
"loss": 0.7747,
"step": 88
},
{
"epoch": 6.26,
"learning_rate": 0.001,
"loss": 0.7664,
"step": 90
},
{
"epoch": 6.4,
"learning_rate": 0.001,
"loss": 0.7998,
"step": 92
},
{
"epoch": 6.54,
"learning_rate": 0.001,
"loss": 0.7715,
"step": 94
},
{
"epoch": 6.68,
"learning_rate": 0.001,
"loss": 0.8038,
"step": 96
},
{
"epoch": 6.82,
"learning_rate": 0.001,
"loss": 0.8059,
"step": 98
},
{
"epoch": 6.96,
"learning_rate": 0.001,
"loss": 0.8082,
"step": 100
},
{
"epoch": 6.96,
"eval_loss": 1.7645633220672607,
"eval_runtime": 84.2691,
"eval_samples_per_second": 4.011,
"eval_steps_per_second": 0.51,
"step": 100
},
{
"epoch": 7.1,
"learning_rate": 0.001,
"loss": 0.6708,
"step": 102
},
{
"epoch": 7.23,
"learning_rate": 0.001,
"loss": 0.6186,
"step": 104
},
{
"epoch": 7.37,
"learning_rate": 0.001,
"loss": 0.6101,
"step": 106
},
{
"epoch": 7.51,
"learning_rate": 0.001,
"loss": 0.6328,
"step": 108
},
{
"epoch": 7.65,
"learning_rate": 0.001,
"loss": 0.6529,
"step": 110
},
{
"epoch": 7.79,
"learning_rate": 0.001,
"loss": 0.6312,
"step": 112
},
{
"epoch": 7.93,
"learning_rate": 0.001,
"loss": 0.664,
"step": 114
},
{
"epoch": 8.0,
"eval_loss": 1.78783118724823,
"eval_runtime": 84.2285,
"eval_samples_per_second": 4.013,
"eval_steps_per_second": 0.511,
"step": 115
},
{
"epoch": 8.07,
"learning_rate": 0.001,
"loss": 0.6177,
"step": 116
},
{
"epoch": 8.21,
"learning_rate": 0.001,
"loss": 0.5241,
"step": 118
},
{
"epoch": 8.35,
"learning_rate": 0.001,
"loss": 0.5173,
"step": 120
},
{
"epoch": 8.49,
"learning_rate": 0.001,
"loss": 0.5241,
"step": 122
},
{
"epoch": 8.63,
"learning_rate": 0.001,
"loss": 0.5546,
"step": 124
},
{
"epoch": 8.77,
"learning_rate": 0.001,
"loss": 0.5401,
"step": 126
},
{
"epoch": 8.9,
"learning_rate": 0.001,
"loss": 0.5471,
"step": 128
},
{
"epoch": 8.97,
"eval_loss": 1.9499958753585815,
"eval_runtime": 84.3243,
"eval_samples_per_second": 4.008,
"eval_steps_per_second": 0.51,
"step": 129
},
{
"epoch": 9.04,
"learning_rate": 0.001,
"loss": 0.526,
"step": 130
},
{
"epoch": 9.18,
"learning_rate": 0.001,
"loss": 0.409,
"step": 132
},
{
"epoch": 9.32,
"learning_rate": 0.001,
"loss": 0.4163,
"step": 134
},
{
"epoch": 9.46,
"learning_rate": 0.001,
"loss": 0.4304,
"step": 136
},
{
"epoch": 9.6,
"learning_rate": 0.001,
"loss": 0.4512,
"step": 138
},
{
"epoch": 9.74,
"learning_rate": 0.001,
"loss": 0.4396,
"step": 140
},
{
"epoch": 9.88,
"learning_rate": 0.001,
"loss": 0.4349,
"step": 142
},
{
"epoch": 9.95,
"eval_loss": 1.9656596183776855,
"eval_runtime": 84.2136,
"eval_samples_per_second": 4.014,
"eval_steps_per_second": 0.511,
"step": 143
},
{
"epoch": 10.02,
"learning_rate": 0.001,
"loss": 0.4077,
"step": 144
},
{
"epoch": 10.16,
"learning_rate": 0.001,
"loss": 0.3439,
"step": 146
},
{
"epoch": 10.3,
"learning_rate": 0.001,
"loss": 0.3503,
"step": 148
},
{
"epoch": 10.43,
"learning_rate": 0.001,
"loss": 0.3572,
"step": 150
},
{
"epoch": 10.57,
"learning_rate": 0.001,
"loss": 0.3643,
"step": 152
},
{
"epoch": 10.71,
"learning_rate": 0.001,
"loss": 0.3516,
"step": 154
},
{
"epoch": 10.85,
"learning_rate": 0.001,
"loss": 0.377,
"step": 156
},
{
"epoch": 10.99,
"learning_rate": 0.001,
"loss": 0.4338,
"step": 158
},
{
"epoch": 10.99,
"eval_loss": 2.135113477706909,
"eval_runtime": 84.161,
"eval_samples_per_second": 4.016,
"eval_steps_per_second": 0.511,
"step": 158
},
{
"epoch": 11.13,
"learning_rate": 0.001,
"loss": 0.2715,
"step": 160
},
{
"epoch": 11.27,
"learning_rate": 0.001,
"loss": 0.2391,
"step": 162
},
{
"epoch": 11.41,
"learning_rate": 0.001,
"loss": 0.2958,
"step": 164
},
{
"epoch": 11.55,
"learning_rate": 0.001,
"loss": 0.3101,
"step": 166
},
{
"epoch": 11.69,
"learning_rate": 0.001,
"loss": 0.3417,
"step": 168
},
{
"epoch": 11.83,
"learning_rate": 0.001,
"loss": 0.3292,
"step": 170
},
{
"epoch": 11.97,
"learning_rate": 0.001,
"loss": 0.2887,
"step": 172
},
{
"epoch": 11.97,
"eval_loss": 2.11661434173584,
"eval_runtime": 84.1912,
"eval_samples_per_second": 4.015,
"eval_steps_per_second": 0.511,
"step": 172
},
{
"epoch": 12.1,
"learning_rate": 0.001,
"loss": 0.2532,
"step": 174
},
{
"epoch": 12.24,
"learning_rate": 0.001,
"loss": 0.2565,
"step": 176
},
{
"epoch": 12.38,
"learning_rate": 0.001,
"loss": 0.2791,
"step": 178
},
{
"epoch": 12.52,
"learning_rate": 0.001,
"loss": 0.2803,
"step": 180
},
{
"epoch": 12.66,
"learning_rate": 0.001,
"loss": 0.3015,
"step": 182
},
{
"epoch": 12.8,
"learning_rate": 0.001,
"loss": 0.2764,
"step": 184
},
{
"epoch": 12.94,
"learning_rate": 0.001,
"loss": 0.2753,
"step": 186
},
{
"epoch": 12.94,
"eval_loss": 2.4357352256774902,
"eval_runtime": 84.1848,
"eval_samples_per_second": 4.015,
"eval_steps_per_second": 0.511,
"step": 186
},
{
"epoch": 13.08,
"learning_rate": 0.001,
"loss": 0.2563,
"step": 188
},
{
"epoch": 13.22,
"learning_rate": 0.001,
"loss": 0.2024,
"step": 190
},
{
"epoch": 13.36,
"learning_rate": 0.001,
"loss": 0.2252,
"step": 192
},
{
"epoch": 13.5,
"learning_rate": 0.001,
"loss": 0.2487,
"step": 194
},
{
"epoch": 13.63,
"learning_rate": 0.001,
"loss": 0.2086,
"step": 196
},
{
"epoch": 13.77,
"learning_rate": 0.001,
"loss": 0.2181,
"step": 198
},
{
"epoch": 13.91,
"learning_rate": 0.001,
"loss": 0.2114,
"step": 200
},
{
"epoch": 13.98,
"eval_loss": 2.5789217948913574,
"eval_runtime": 84.14,
"eval_samples_per_second": 4.017,
"eval_steps_per_second": 0.511,
"step": 201
},
{
"epoch": 14.05,
"learning_rate": 0.001,
"loss": 0.1828,
"step": 202
},
{
"epoch": 14.19,
"learning_rate": 0.001,
"loss": 0.2025,
"step": 204
},
{
"epoch": 14.33,
"learning_rate": 0.001,
"loss": 0.1991,
"step": 206
},
{
"epoch": 14.47,
"learning_rate": 0.001,
"loss": 0.1844,
"step": 208
},
{
"epoch": 14.61,
"learning_rate": 0.001,
"loss": 0.1934,
"step": 210
},
{
"epoch": 14.75,
"learning_rate": 0.001,
"loss": 0.2,
"step": 212
},
{
"epoch": 14.89,
"learning_rate": 0.001,
"loss": 0.1805,
"step": 214
},
{
"epoch": 14.96,
"eval_loss": 2.6075170040130615,
"eval_runtime": 84.2554,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.51,
"step": 215
},
{
"epoch": 15.03,
"learning_rate": 0.001,
"loss": 0.1739,
"step": 216
},
{
"epoch": 15.17,
"learning_rate": 0.001,
"loss": 0.1504,
"step": 218
},
{
"epoch": 15.3,
"learning_rate": 0.001,
"loss": 0.1431,
"step": 220
},
{
"epoch": 15.44,
"learning_rate": 0.001,
"loss": 0.152,
"step": 222
},
{
"epoch": 15.58,
"learning_rate": 0.001,
"loss": 0.142,
"step": 224
},
{
"epoch": 15.72,
"learning_rate": 0.001,
"loss": 0.145,
"step": 226
},
{
"epoch": 15.86,
"learning_rate": 0.001,
"loss": 0.1476,
"step": 228
},
{
"epoch": 16.0,
"learning_rate": 0.001,
"loss": 0.1543,
"step": 230
},
{
"epoch": 16.0,
"eval_loss": 2.5597331523895264,
"eval_runtime": 84.1535,
"eval_samples_per_second": 4.016,
"eval_steps_per_second": 0.511,
"step": 230
},
{
"epoch": 16.14,
"learning_rate": 0.001,
"loss": 0.1245,
"step": 232
},
{
"epoch": 16.28,
"learning_rate": 0.001,
"loss": 0.1204,
"step": 234
},
{
"epoch": 16.42,
"learning_rate": 0.001,
"loss": 0.2117,
"step": 236
},
{
"epoch": 16.56,
"learning_rate": 0.001,
"loss": 0.6894,
"step": 238
},
{
"epoch": 16.7,
"learning_rate": 0.001,
"loss": 0.35,
"step": 240
},
{
"epoch": 16.83,
"learning_rate": 0.001,
"loss": 0.8395,
"step": 242
},
{
"epoch": 16.97,
"learning_rate": 0.001,
"loss": 0.5166,
"step": 244
},
{
"epoch": 16.97,
"eval_loss": 2.5066745281219482,
"eval_runtime": 84.1866,
"eval_samples_per_second": 4.015,
"eval_steps_per_second": 0.511,
"step": 244
},
{
"epoch": 17.11,
"learning_rate": 0.001,
"loss": 0.202,
"step": 246
},
{
"epoch": 17.25,
"learning_rate": 0.001,
"loss": 0.1247,
"step": 248
},
{
"epoch": 17.39,
"learning_rate": 0.001,
"loss": 0.1368,
"step": 250
},
{
"epoch": 17.53,
"learning_rate": 0.001,
"loss": 0.1096,
"step": 252
},
{
"epoch": 17.67,
"learning_rate": 0.001,
"loss": 0.1066,
"step": 254
},
{
"epoch": 17.81,
"learning_rate": 0.001,
"loss": 0.1078,
"step": 256
},
{
"epoch": 17.95,
"learning_rate": 0.001,
"loss": 0.1117,
"step": 258
},
{
"epoch": 17.95,
"eval_loss": 2.808701515197754,
"eval_runtime": 84.1837,
"eval_samples_per_second": 4.015,
"eval_steps_per_second": 0.511,
"step": 258
},
{
"epoch": 18.09,
"learning_rate": 0.001,
"loss": 0.0934,
"step": 260
},
{
"epoch": 18.23,
"learning_rate": 0.001,
"loss": 0.0793,
"step": 262
},
{
"epoch": 18.37,
"learning_rate": 0.001,
"loss": 0.0887,
"step": 264
},
{
"epoch": 18.5,
"learning_rate": 0.001,
"loss": 0.103,
"step": 266
},
{
"epoch": 18.64,
"learning_rate": 0.001,
"loss": 0.0847,
"step": 268
},
{
"epoch": 18.78,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 270
},
{
"epoch": 18.92,
"learning_rate": 0.001,
"loss": 0.0895,
"step": 272
},
{
"epoch": 18.99,
"eval_loss": 2.7578108310699463,
"eval_runtime": 84.2046,
"eval_samples_per_second": 4.014,
"eval_steps_per_second": 0.511,
"step": 273
},
{
"epoch": 19.06,
"learning_rate": 0.001,
"loss": 0.0884,
"step": 274
},
{
"epoch": 19.2,
"learning_rate": 0.001,
"loss": 0.0838,
"step": 276
},
{
"epoch": 19.34,
"learning_rate": 0.001,
"loss": 0.0731,
"step": 278
},
{
"epoch": 19.48,
"learning_rate": 0.001,
"loss": 0.0779,
"step": 280
},
{
"epoch": 19.48,
"eval_loss": 2.892078161239624,
"eval_runtime": 84.2063,
"eval_samples_per_second": 4.014,
"eval_steps_per_second": 0.511,
"step": 280
},
{
"epoch": 19.48,
"step": 280,
"total_flos": 4.895224157149471e+18,
"train_loss": 0.722327525381531,
"train_runtime": 70402.3599,
"train_samples_per_second": 1.043,
"train_steps_per_second": 0.004
}
],
"logging_steps": 2,
"max_steps": 280,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 4.895224157149471e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}