learn3r committed on
Commit
e811081
1 Parent(s): 0f84021

End of training

Browse files
Files changed (5) hide show
  1. README.md +21 -8
  2. all_results.json +14 -14
  3. eval_results.json +10 -10
  4. train_results.json +5 -5
  5. trainer_state.json +718 -168
README.md CHANGED
@@ -1,11 +1,24 @@
1
  ---
 
2
  tags:
3
  - generated_from_trainer
 
 
4
  metrics:
5
  - rouge
6
  model-index:
7
  - name: longt5_xl_summ_screen_bp_only_30
8
- results: []
 
 
 
 
 
 
 
 
 
 
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,14 +26,14 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # longt5_xl_summ_screen_bp_only_30
15
 
16
- This model was trained from scratch on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 3.4990
19
- - Rouge1: 32.5815
20
- - Rouge2: 14.2951
21
- - Rougel: 22.4501
22
- - Rougelsum: 31.2928
23
- - Gen Len: 499.3107
24
 
25
  ## Model description
26
 
 
1
  ---
2
+ base_model: /exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen_bp_only/checkpoint-210
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - learn3r/summ_screen_fd_bp
7
  metrics:
8
  - rouge
9
  model-index:
10
  - name: longt5_xl_summ_screen_bp_only_30
11
+ results:
12
+ - task:
13
+ name: Summarization
14
+ type: summarization
15
+ dataset:
16
+ name: learn3r/summ_screen_fd_bp
17
+ type: learn3r/summ_screen_fd_bp
18
+ metrics:
19
+ - name: Rouge1
20
+ type: rouge
21
+ value: 40.4388
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # longt5_xl_summ_screen_bp_only_30
28
 
29
+ This model is a fine-tuned version of a local checkpoint (`/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen_bp_only/checkpoint-210`) on the learn3r/summ_screen_fd_bp dataset.
30
  It achieves the following results on the evaluation set:
31
+ - Loss: 2.2376
32
+ - Rouge1: 40.4388
33
+ - Rouge2: 16.4662
34
+ - Rougel: 28.0771
35
+ - Rougelsum: 38.3405
36
+ - Gen Len: 246.7396
37
 
38
  ## Model description
39
 
all_results.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
- "epoch": 13.18,
3
- "eval_gen_len": 246.3491124260355,
4
- "eval_loss": 2.2397100925445557,
5
- "eval_rouge1": 40.4943,
6
- "eval_rouge2": 16.4695,
7
- "eval_rougeL": 28.0964,
8
- "eval_rougeLsum": 38.3693,
9
- "eval_runtime": 467.1505,
10
  "eval_samples": 338,
11
- "eval_samples_per_second": 0.724,
12
- "eval_steps_per_second": 0.024,
13
- "train_loss": 0.16495737830797833,
14
- "train_runtime": 12728.0808,
15
  "train_samples": 3673,
16
- "train_samples_per_second": 4.329,
17
- "train_steps_per_second": 0.004
18
  }
 
1
  {
2
+ "epoch": 14.61,
3
+ "eval_gen_len": 246.7396449704142,
4
+ "eval_loss": 2.237640380859375,
5
+ "eval_rouge1": 40.4388,
6
+ "eval_rouge2": 16.4662,
7
+ "eval_rougeL": 28.0771,
8
+ "eval_rougeLsum": 38.3405,
9
+ "eval_runtime": 1708.4379,
10
  "eval_samples": 338,
11
+ "eval_samples_per_second": 0.198,
12
+ "eval_steps_per_second": 0.025,
13
+ "train_loss": 0.13047422234501158,
14
+ "train_runtime": 78176.4332,
15
  "train_samples": 3673,
16
+ "train_samples_per_second": 0.705,
17
+ "train_steps_per_second": 0.003
18
  }
eval_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 13.18,
3
- "eval_gen_len": 246.3491124260355,
4
- "eval_loss": 2.2397100925445557,
5
- "eval_rouge1": 40.4943,
6
- "eval_rouge2": 16.4695,
7
- "eval_rougeL": 28.0964,
8
- "eval_rougeLsum": 38.3693,
9
- "eval_runtime": 467.1505,
10
  "eval_samples": 338,
11
- "eval_samples_per_second": 0.724,
12
- "eval_steps_per_second": 0.024
13
  }
 
1
  {
2
+ "epoch": 14.61,
3
+ "eval_gen_len": 246.7396449704142,
4
+ "eval_loss": 2.237640380859375,
5
+ "eval_rouge1": 40.4388,
6
+ "eval_rouge2": 16.4662,
7
+ "eval_rougeL": 28.0771,
8
+ "eval_rougeLsum": 38.3405,
9
+ "eval_runtime": 1708.4379,
10
  "eval_samples": 338,
11
+ "eval_samples_per_second": 0.198,
12
+ "eval_steps_per_second": 0.025
13
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 13.18,
3
- "train_loss": 0.16495737830797833,
4
- "train_runtime": 12728.0808,
5
  "train_samples": 3673,
6
- "train_samples_per_second": 4.329,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
+ "epoch": 14.61,
3
+ "train_loss": 0.13047422234501158,
4
+ "train_runtime": 78176.4332,
5
  "train_samples": 3673,
6
+ "train_samples_per_second": 0.705,
7
+ "train_steps_per_second": 0.003
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 2.237640380859375,
3
  "best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen_bp_only_30/checkpoint-14",
4
- "epoch": 13.182608695652174,
5
  "eval_steps": 500,
6
- "global_step": 45,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -58,246 +58,796 @@
58
  "eval_rouge2": 16.4662,
59
  "eval_rougeL": 28.0771,
60
  "eval_rougeLsum": 38.3405,
61
- "eval_runtime": 1741.7742,
62
- "eval_samples_per_second": 0.194,
63
  "eval_steps_per_second": 0.025,
64
  "step": 14
65
  },
66
  {
67
- "epoch": 4.83,
68
- "eval_gen_len": 503.56508875739644,
69
- "eval_loss": 2.572709798812866,
70
- "eval_rouge1": 30.0123,
71
- "eval_rouge2": 12.3701,
72
- "eval_rougeL": 21.2834,
73
- "eval_rougeLsum": 28.891,
74
- "eval_runtime": 472.1582,
75
- "eval_samples_per_second": 0.716,
76
- "eval_steps_per_second": 0.023,
77
- "step": 15
78
- },
79
- {
80
- "epoch": 5.11,
81
  "learning_rate": 0.0005,
82
- "loss": 0.278,
83
  "step": 16
84
  },
85
  {
86
- "epoch": 5.67,
87
  "learning_rate": 0.0005,
88
- "loss": 0.3036,
89
  "step": 18
90
  },
91
  {
92
- "epoch": 5.95,
93
- "eval_gen_len": 506.9526627218935,
94
- "eval_loss": 2.2658941745758057,
95
- "eval_rouge1": 27.8421,
96
- "eval_rouge2": 11.1942,
97
- "eval_rougeL": 20.4713,
98
- "eval_rougeLsum": 26.6097,
99
- "eval_runtime": 472.9998,
100
- "eval_samples_per_second": 0.715,
101
- "eval_steps_per_second": 0.023,
102
- "step": 19
103
- },
104
- {
105
- "epoch": 6.23,
106
  "learning_rate": 0.0005,
107
- "loss": 0.2937,
108
  "step": 20
109
  },
110
  {
111
- "epoch": 6.78,
112
  "learning_rate": 0.0005,
113
- "loss": 0.2941,
114
  "step": 22
115
  },
116
  {
117
- "epoch": 6.78,
118
- "eval_gen_len": 284.2307692307692,
119
- "eval_loss": 2.2636122703552246,
120
- "eval_rouge1": 40.8304,
121
- "eval_rouge2": 17.3615,
122
- "eval_rougeL": 28.0971,
123
- "eval_rougeLsum": 39.0943,
124
- "eval_runtime": 468.1667,
125
- "eval_samples_per_second": 0.722,
126
- "eval_steps_per_second": 0.023,
127
- "step": 22
128
- },
129
- {
130
- "epoch": 7.34,
131
  "learning_rate": 0.0005,
132
- "loss": 0.2508,
133
  "step": 24
134
  },
135
  {
136
- "epoch": 7.9,
137
  "learning_rate": 0.0005,
138
- "loss": 0.2642,
139
- "step": 26
140
- },
141
- {
142
- "epoch": 7.9,
143
- "eval_gen_len": 341.25147928994085,
144
- "eval_loss": 2.286357879638672,
145
- "eval_rouge1": 38.3377,
146
- "eval_rouge2": 15.8119,
147
- "eval_rougeL": 26.4838,
148
- "eval_rougeLsum": 36.5174,
149
- "eval_runtime": 469.1516,
150
- "eval_samples_per_second": 0.72,
151
- "eval_steps_per_second": 0.023,
152
  "step": 26
153
  },
154
  {
155
- "epoch": 8.45,
156
  "learning_rate": 0.0005,
157
- "loss": 0.2604,
158
  "step": 28
159
  },
160
  {
161
- "epoch": 8.73,
162
- "eval_gen_len": 435.26331360946745,
163
- "eval_loss": 2.455064535140991,
164
- "eval_rouge1": 33.2021,
165
- "eval_rouge2": 13.6577,
166
- "eval_rougeL": 23.3288,
167
- "eval_rougeLsum": 31.8326,
168
- "eval_runtime": 471.959,
169
- "eval_samples_per_second": 0.716,
170
- "eval_steps_per_second": 0.023,
171
- "step": 29
172
  },
173
  {
174
- "epoch": 9.01,
175
  "learning_rate": 0.0005,
176
- "loss": 0.2422,
177
  "step": 30
178
  },
179
  {
180
- "epoch": 9.57,
181
  "learning_rate": 0.0005,
182
- "loss": 0.2237,
183
  "step": 32
184
  },
185
  {
186
- "epoch": 9.84,
187
- "eval_gen_len": 234.61242603550295,
188
- "eval_loss": 2.615253210067749,
189
- "eval_rouge1": 40.3297,
190
- "eval_rouge2": 15.3786,
191
- "eval_rougeL": 28.1208,
192
- "eval_rougeLsum": 38.2426,
193
- "eval_runtime": 470.1581,
194
- "eval_samples_per_second": 0.719,
195
- "eval_steps_per_second": 0.023,
196
- "step": 33
197
- },
198
- {
199
- "epoch": 10.12,
200
  "learning_rate": 0.0005,
201
- "loss": 0.2184,
202
  "step": 34
203
  },
204
  {
205
- "epoch": 10.68,
206
  "learning_rate": 0.0005,
207
- "loss": 0.1904,
208
  "step": 36
209
  },
210
  {
211
- "epoch": 10.96,
212
- "eval_gen_len": 174.57396449704143,
213
- "eval_loss": 2.666517734527588,
214
- "eval_rouge1": 39.6006,
215
- "eval_rouge2": 14.9586,
216
- "eval_rougeL": 27.2453,
217
- "eval_rougeLsum": 37.6744,
218
- "eval_runtime": 450.9467,
219
- "eval_samples_per_second": 0.75,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  "eval_steps_per_second": 0.024,
221
- "step": 37
222
  },
223
  {
224
- "epoch": 11.23,
225
  "learning_rate": 0.0005,
226
- "loss": 0.2076,
227
- "step": 38
228
  },
229
  {
230
- "epoch": 11.79,
231
  "learning_rate": 0.0005,
232
- "loss": 0.2247,
233
- "step": 40
234
  },
235
  {
236
- "epoch": 11.79,
237
- "eval_gen_len": 500.58284023668637,
238
- "eval_loss": 2.722362518310547,
239
- "eval_rouge1": 30.5957,
240
- "eval_rouge2": 13.3496,
241
- "eval_rougeL": 21.9712,
242
- "eval_rougeLsum": 29.22,
243
- "eval_runtime": 477.6586,
244
- "eval_samples_per_second": 0.708,
245
- "eval_steps_per_second": 0.023,
246
- "step": 40
247
  },
248
  {
249
- "epoch": 12.35,
250
  "learning_rate": 0.0005,
251
- "loss": 0.1866,
252
- "step": 42
253
  },
254
  {
255
- "epoch": 12.9,
256
  "learning_rate": 0.0005,
257
- "loss": 0.182,
258
- "step": 44
259
  },
260
  {
261
- "epoch": 12.9,
262
- "eval_gen_len": 259.6568047337278,
263
- "eval_loss": 3.271503448486328,
264
- "eval_rouge1": 41.6828,
265
- "eval_rouge2": 17.0818,
266
- "eval_rougeL": 28.087,
267
- "eval_rougeLsum": 39.5947,
268
- "eval_runtime": 468.2637,
269
- "eval_samples_per_second": 0.722,
270
- "eval_steps_per_second": 0.023,
271
- "step": 44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  },
273
  {
274
- "epoch": 13.18,
275
- "eval_gen_len": 484.396449704142,
276
- "eval_loss": 2.397312641143799,
277
- "eval_rouge1": 31.9833,
278
- "eval_rouge2": 14.0141,
279
- "eval_rougeL": 22.6823,
280
- "eval_rougeLsum": 30.6424,
281
- "eval_runtime": 471.2439,
282
- "eval_samples_per_second": 0.717,
283
- "eval_steps_per_second": 0.023,
284
- "step": 45
285
- },
286
- {
287
- "epoch": 13.18,
288
- "step": 45,
289
- "total_flos": 2.4232217682745754e+18,
290
- "train_loss": 0.16495737830797833,
291
- "train_runtime": 12728.0808,
292
- "train_samples_per_second": 4.329,
293
- "train_steps_per_second": 0.004
294
  }
295
  ],
296
  "logging_steps": 2,
297
- "max_steps": 45,
298
  "num_train_epochs": 15,
299
  "save_steps": 500,
300
- "total_flos": 2.4232217682745754e+18,
301
  "trial_name": null,
302
  "trial_params": null
303
  }
 
1
  {
2
  "best_metric": 2.237640380859375,
3
  "best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_summ_screen_bp_only_30/checkpoint-14",
4
+ "epoch": 14.608695652173914,
5
  "eval_steps": 500,
6
+ "global_step": 210,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
58
  "eval_rouge2": 16.4662,
59
  "eval_rougeL": 28.0771,
60
  "eval_rougeLsum": 38.3405,
61
+ "eval_runtime": 1709.2671,
62
+ "eval_samples_per_second": 0.198,
63
  "eval_steps_per_second": 0.025,
64
  "step": 14
65
  },
66
  {
67
+ "epoch": 1.11,
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "learning_rate": 0.0005,
69
+ "loss": 0.2825,
70
  "step": 16
71
  },
72
  {
73
+ "epoch": 1.25,
74
  "learning_rate": 0.0005,
75
+ "loss": 0.2835,
76
  "step": 18
77
  },
78
  {
79
+ "epoch": 1.39,
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  "learning_rate": 0.0005,
81
+ "loss": 0.2732,
82
  "step": 20
83
  },
84
  {
85
+ "epoch": 1.53,
86
  "learning_rate": 0.0005,
87
+ "loss": 0.2928,
88
  "step": 22
89
  },
90
  {
91
+ "epoch": 1.67,
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "learning_rate": 0.0005,
93
+ "loss": 0.2912,
94
  "step": 24
95
  },
96
  {
97
+ "epoch": 1.81,
98
  "learning_rate": 0.0005,
99
+ "loss": 0.2914,
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  "step": 26
101
  },
102
  {
103
+ "epoch": 1.95,
104
  "learning_rate": 0.0005,
105
+ "loss": 0.2707,
106
  "step": 28
107
  },
108
  {
109
+ "epoch": 1.95,
110
+ "eval_gen_len": 307.3786982248521,
111
+ "eval_loss": 2.320437431335449,
112
+ "eval_rouge1": 40.2873,
113
+ "eval_rouge2": 16.7641,
114
+ "eval_rougeL": 27.3895,
115
+ "eval_rougeLsum": 38.2689,
116
+ "eval_runtime": 1775.692,
117
+ "eval_samples_per_second": 0.19,
118
+ "eval_steps_per_second": 0.024,
119
+ "step": 28
120
  },
121
  {
122
+ "epoch": 2.09,
123
  "learning_rate": 0.0005,
124
+ "loss": 0.2342,
125
  "step": 30
126
  },
127
  {
128
+ "epoch": 2.23,
129
  "learning_rate": 0.0005,
130
+ "loss": 0.2326,
131
  "step": 32
132
  },
133
  {
134
+ "epoch": 2.37,
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  "learning_rate": 0.0005,
136
+ "loss": 0.2334,
137
  "step": 34
138
  },
139
  {
140
+ "epoch": 2.5,
141
  "learning_rate": 0.0005,
142
+ "loss": 0.2212,
143
  "step": 36
144
  },
145
  {
146
+ "epoch": 2.64,
147
+ "learning_rate": 0.0005,
148
+ "loss": 0.2161,
149
+ "step": 38
150
+ },
151
+ {
152
+ "epoch": 2.78,
153
+ "learning_rate": 0.0005,
154
+ "loss": 0.2236,
155
+ "step": 40
156
+ },
157
+ {
158
+ "epoch": 2.92,
159
+ "learning_rate": 0.0005,
160
+ "loss": 0.2217,
161
+ "step": 42
162
+ },
163
+ {
164
+ "epoch": 2.99,
165
+ "eval_gen_len": 501.93195266272187,
166
+ "eval_loss": 2.5281190872192383,
167
+ "eval_rouge1": 31.9916,
168
+ "eval_rouge2": 13.8136,
169
+ "eval_rougeL": 22.1895,
170
+ "eval_rougeLsum": 30.623,
171
+ "eval_runtime": 1780.5368,
172
+ "eval_samples_per_second": 0.19,
173
  "eval_steps_per_second": 0.024,
174
+ "step": 43
175
  },
176
  {
177
+ "epoch": 3.06,
178
  "learning_rate": 0.0005,
179
+ "loss": 0.2055,
180
+ "step": 44
181
  },
182
  {
183
+ "epoch": 3.2,
184
  "learning_rate": 0.0005,
185
+ "loss": 0.1943,
186
+ "step": 46
187
  },
188
  {
189
+ "epoch": 3.34,
190
+ "learning_rate": 0.0005,
191
+ "loss": 0.1875,
192
+ "step": 48
 
 
 
 
 
 
 
193
  },
194
  {
195
+ "epoch": 3.48,
196
  "learning_rate": 0.0005,
197
+ "loss": 0.1909,
198
+ "step": 50
199
  },
200
  {
201
+ "epoch": 3.62,
202
  "learning_rate": 0.0005,
203
+ "loss": 0.1881,
204
+ "step": 52
205
  },
206
  {
207
+ "epoch": 3.76,
208
+ "learning_rate": 0.0005,
209
+ "loss": 0.1541,
210
+ "step": 54
211
+ },
212
+ {
213
+ "epoch": 3.9,
214
+ "learning_rate": 0.0005,
215
+ "loss": 0.1776,
216
+ "step": 56
217
+ },
218
+ {
219
+ "epoch": 3.97,
220
+ "eval_gen_len": 489.6183431952663,
221
+ "eval_loss": 2.7530014514923096,
222
+ "eval_rouge1": 31.7535,
223
+ "eval_rouge2": 13.8852,
224
+ "eval_rougeL": 22.8653,
225
+ "eval_rougeLsum": 30.3796,
226
+ "eval_runtime": 1780.5033,
227
+ "eval_samples_per_second": 0.19,
228
+ "eval_steps_per_second": 0.024,
229
+ "step": 57
230
+ },
231
+ {
232
+ "epoch": 4.03,
233
+ "learning_rate": 0.0005,
234
+ "loss": 0.1876,
235
+ "step": 58
236
+ },
237
+ {
238
+ "epoch": 4.17,
239
+ "learning_rate": 0.0005,
240
+ "loss": 0.1589,
241
+ "step": 60
242
+ },
243
+ {
244
+ "epoch": 4.31,
245
+ "learning_rate": 0.0005,
246
+ "loss": 0.1529,
247
+ "step": 62
248
+ },
249
+ {
250
+ "epoch": 4.45,
251
+ "learning_rate": 0.0005,
252
+ "loss": 0.1464,
253
+ "step": 64
254
+ },
255
+ {
256
+ "epoch": 4.59,
257
+ "learning_rate": 0.0005,
258
+ "loss": 0.1689,
259
+ "step": 66
260
+ },
261
+ {
262
+ "epoch": 4.73,
263
+ "learning_rate": 0.0005,
264
+ "loss": 0.1492,
265
+ "step": 68
266
+ },
267
+ {
268
+ "epoch": 4.87,
269
+ "learning_rate": 0.0005,
270
+ "loss": 0.1424,
271
+ "step": 70
272
+ },
273
+ {
274
+ "epoch": 4.94,
275
+ "eval_gen_len": 502.11242603550295,
276
+ "eval_loss": 2.657783031463623,
277
+ "eval_rouge1": 32.117,
278
+ "eval_rouge2": 14.2141,
279
+ "eval_rougeL": 22.3733,
280
+ "eval_rougeLsum": 30.8328,
281
+ "eval_runtime": 1780.0926,
282
+ "eval_samples_per_second": 0.19,
283
+ "eval_steps_per_second": 0.024,
284
+ "step": 71
285
+ },
286
+ {
287
+ "epoch": 5.01,
288
+ "learning_rate": 0.0005,
289
+ "loss": 0.1494,
290
+ "step": 72
291
+ },
292
+ {
293
+ "epoch": 5.15,
294
+ "learning_rate": 0.0005,
295
+ "loss": 0.118,
296
+ "step": 74
297
+ },
298
+ {
299
+ "epoch": 5.29,
300
+ "learning_rate": 0.0005,
301
+ "loss": 0.1499,
302
+ "step": 76
303
+ },
304
+ {
305
+ "epoch": 5.43,
306
+ "learning_rate": 0.0005,
307
+ "loss": 0.1369,
308
+ "step": 78
309
+ },
310
+ {
311
+ "epoch": 5.57,
312
+ "learning_rate": 0.0005,
313
+ "loss": 0.1084,
314
+ "step": 80
315
+ },
316
+ {
317
+ "epoch": 5.7,
318
+ "learning_rate": 0.0005,
319
+ "loss": 0.117,
320
+ "step": 82
321
+ },
322
+ {
323
+ "epoch": 5.84,
324
+ "learning_rate": 0.0005,
325
+ "loss": 0.144,
326
+ "step": 84
327
+ },
328
+ {
329
+ "epoch": 5.98,
330
+ "learning_rate": 0.0005,
331
+ "loss": 0.1449,
332
+ "step": 86
333
+ },
334
+ {
335
+ "epoch": 5.98,
336
+ "eval_gen_len": 357.31360946745565,
337
+ "eval_loss": 2.5508346557617188,
338
+ "eval_rouge1": 35.3448,
339
+ "eval_rouge2": 13.8478,
340
+ "eval_rougeL": 24.9044,
341
+ "eval_rougeLsum": 33.6108,
342
+ "eval_runtime": 1768.966,
343
+ "eval_samples_per_second": 0.191,
344
+ "eval_steps_per_second": 0.024,
345
+ "step": 86
346
+ },
347
+ {
348
+ "epoch": 6.12,
349
+ "learning_rate": 0.0005,
350
+ "loss": 0.1101,
351
+ "step": 88
352
+ },
353
+ {
354
+ "epoch": 6.26,
355
+ "learning_rate": 0.0005,
356
+ "loss": 0.0985,
357
+ "step": 90
358
+ },
359
+ {
360
+ "epoch": 6.4,
361
+ "learning_rate": 0.0005,
362
+ "loss": 0.1101,
363
+ "step": 92
364
+ },
365
+ {
366
+ "epoch": 6.54,
367
+ "learning_rate": 0.0005,
368
+ "loss": 0.1013,
369
+ "step": 94
370
+ },
371
+ {
372
+ "epoch": 6.68,
373
+ "learning_rate": 0.0005,
374
+ "loss": 0.1057,
375
+ "step": 96
376
+ },
377
+ {
378
+ "epoch": 6.82,
379
+ "learning_rate": 0.0005,
380
+ "loss": 0.1102,
381
+ "step": 98
382
+ },
383
+ {
384
+ "epoch": 6.96,
385
+ "learning_rate": 0.0005,
386
+ "loss": 0.1191,
387
+ "step": 100
388
+ },
389
+ {
390
+ "epoch": 6.96,
391
+ "eval_gen_len": 408.86686390532543,
392
+ "eval_loss": 3.162177324295044,
393
+ "eval_rouge1": 37.2189,
394
+ "eval_rouge2": 16.0076,
395
+ "eval_rougeL": 25.7011,
396
+ "eval_rougeLsum": 35.294,
397
+ "eval_runtime": 1778.8704,
398
+ "eval_samples_per_second": 0.19,
399
+ "eval_steps_per_second": 0.024,
400
+ "step": 100
401
+ },
402
+ {
403
+ "epoch": 7.1,
404
+ "learning_rate": 0.0005,
405
+ "loss": 0.1,
406
+ "step": 102
407
+ },
408
+ {
409
+ "epoch": 7.23,
410
+ "learning_rate": 0.0005,
411
+ "loss": 0.0949,
412
+ "step": 104
413
+ },
414
+ {
415
+ "epoch": 7.37,
416
+ "learning_rate": 0.0005,
417
+ "loss": 0.0974,
418
+ "step": 106
419
+ },
420
+ {
421
+ "epoch": 7.51,
422
+ "learning_rate": 0.0005,
423
+ "loss": 0.096,
424
+ "step": 108
425
+ },
426
+ {
427
+ "epoch": 7.65,
428
+ "learning_rate": 0.0005,
429
+ "loss": 0.1023,
430
+ "step": 110
431
+ },
432
+ {
433
+ "epoch": 7.79,
434
+ "learning_rate": 0.0005,
435
+ "loss": 0.0829,
436
+ "step": 112
437
+ },
438
+ {
439
+ "epoch": 7.93,
440
+ "learning_rate": 0.0005,
441
+ "loss": 0.0879,
442
+ "step": 114
443
+ },
444
+ {
445
+ "epoch": 8.0,
446
+ "eval_gen_len": 318.2278106508876,
447
+ "eval_loss": 2.8510310649871826,
448
+ "eval_rouge1": 39.8825,
449
+ "eval_rouge2": 16.8073,
450
+ "eval_rougeL": 27.2428,
451
+ "eval_rougeLsum": 37.9568,
452
+ "eval_runtime": 1776.4341,
453
+ "eval_samples_per_second": 0.19,
454
+ "eval_steps_per_second": 0.024,
455
+ "step": 115
456
+ },
457
+ {
458
+ "epoch": 8.07,
459
+ "learning_rate": 0.0005,
460
+ "loss": 0.0957,
461
+ "step": 116
462
+ },
463
+ {
464
+ "epoch": 8.21,
465
+ "learning_rate": 0.0005,
466
+ "loss": 0.0991,
467
+ "step": 118
468
+ },
469
+ {
470
+ "epoch": 8.35,
471
+ "learning_rate": 0.0005,
472
+ "loss": 0.095,
473
+ "step": 120
474
+ },
475
+ {
476
+ "epoch": 8.49,
477
+ "learning_rate": 0.0005,
478
+ "loss": 0.0999,
479
+ "step": 122
480
+ },
481
+ {
482
+ "epoch": 8.63,
483
+ "learning_rate": 0.0005,
484
+ "loss": 0.0787,
485
+ "step": 124
486
+ },
487
+ {
488
+ "epoch": 8.77,
489
+ "learning_rate": 0.0005,
490
+ "loss": 0.0716,
491
+ "step": 126
492
+ },
493
+ {
494
+ "epoch": 8.9,
495
+ "learning_rate": 0.0005,
496
+ "loss": 0.0899,
497
+ "step": 128
498
+ },
499
+ {
500
+ "epoch": 8.97,
501
+ "eval_gen_len": 500.405325443787,
502
+ "eval_loss": 2.9137675762176514,
503
+ "eval_rouge1": 31.7139,
504
+ "eval_rouge2": 13.7066,
505
+ "eval_rougeL": 21.8844,
506
+ "eval_rougeLsum": 30.5075,
507
+ "eval_runtime": 1780.3113,
508
+ "eval_samples_per_second": 0.19,
509
+ "eval_steps_per_second": 0.024,
510
+ "step": 129
511
+ },
512
+ {
513
+ "epoch": 9.04,
514
+ "learning_rate": 0.0005,
515
+ "loss": 0.0945,
516
+ "step": 130
517
+ },
518
+ {
519
+ "epoch": 9.18,
520
+ "learning_rate": 0.0005,
521
+ "loss": 0.0597,
522
+ "step": 132
523
+ },
524
+ {
525
+ "epoch": 9.32,
526
+ "learning_rate": 0.0005,
527
+ "loss": 0.0586,
528
+ "step": 134
529
+ },
530
+ {
531
+ "epoch": 9.46,
532
+ "learning_rate": 0.0005,
533
+ "loss": 0.0757,
534
+ "step": 136
535
+ },
536
+ {
537
+ "epoch": 9.6,
538
+ "learning_rate": 0.0005,
539
+ "loss": 0.0665,
540
+ "step": 138
541
+ },
542
+ {
543
+ "epoch": 9.74,
544
+ "learning_rate": 0.0005,
545
+ "loss": 0.0594,
546
+ "step": 140
547
+ },
548
+ {
549
+ "epoch": 9.88,
550
+ "learning_rate": 0.0005,
551
+ "loss": 0.0656,
552
+ "step": 142
553
+ },
554
+ {
555
+ "epoch": 9.95,
556
+ "eval_gen_len": 488.1686390532544,
557
+ "eval_loss": 3.1616334915161133,
558
+ "eval_rouge1": 33.055,
559
+ "eval_rouge2": 14.5841,
560
+ "eval_rougeL": 22.5883,
561
+ "eval_rougeLsum": 31.7565,
562
+ "eval_runtime": 1782.2406,
563
+ "eval_samples_per_second": 0.19,
564
+ "eval_steps_per_second": 0.024,
565
+ "step": 143
566
+ },
567
+ {
568
+ "epoch": 10.02,
569
+ "learning_rate": 0.0005,
570
+ "loss": 0.0613,
571
+ "step": 144
572
+ },
573
+ {
574
+ "epoch": 10.16,
575
+ "learning_rate": 0.0005,
576
+ "loss": 0.0621,
577
+ "step": 146
578
+ },
579
+ {
580
+ "epoch": 10.3,
581
+ "learning_rate": 0.0005,
582
+ "loss": 0.0739,
583
+ "step": 148
584
+ },
585
+ {
586
+ "epoch": 10.43,
587
+ "learning_rate": 0.0005,
588
+ "loss": 0.0841,
589
+ "step": 150
590
+ },
591
+ {
592
+ "epoch": 10.57,
593
+ "learning_rate": 0.0005,
594
+ "loss": 0.0562,
595
+ "step": 152
596
+ },
597
+ {
598
+ "epoch": 10.71,
599
+ "learning_rate": 0.0005,
600
+ "loss": 0.0611,
601
+ "step": 154
602
+ },
603
+ {
604
+ "epoch": 10.85,
605
+ "learning_rate": 0.0005,
606
+ "loss": 0.0607,
607
+ "step": 156
608
+ },
609
+ {
610
+ "epoch": 10.99,
611
+ "learning_rate": 0.0005,
612
+ "loss": 0.0542,
613
+ "step": 158
614
+ },
615
+ {
616
+ "epoch": 10.99,
617
+ "eval_gen_len": 198.80769230769232,
618
+ "eval_loss": 3.3630056381225586,
619
+ "eval_rouge1": 43.7514,
620
+ "eval_rouge2": 18.9011,
621
+ "eval_rougeL": 29.9017,
622
+ "eval_rougeLsum": 41.6887,
623
+ "eval_runtime": 1460.3937,
624
+ "eval_samples_per_second": 0.231,
625
+ "eval_steps_per_second": 0.029,
626
+ "step": 158
627
+ },
628
+ {
629
+ "epoch": 11.13,
630
+ "learning_rate": 0.0005,
631
+ "loss": 0.0548,
632
+ "step": 160
633
+ },
634
+ {
635
+ "epoch": 11.27,
636
+ "learning_rate": 0.0005,
637
+ "loss": 0.0477,
638
+ "step": 162
639
+ },
640
+ {
641
+ "epoch": 11.41,
642
+ "learning_rate": 0.0005,
643
+ "loss": 0.052,
644
+ "step": 164
645
+ },
646
+ {
647
+ "epoch": 11.55,
648
+ "learning_rate": 0.0005,
649
+ "loss": 0.053,
650
+ "step": 166
651
+ },
652
+ {
653
+ "epoch": 11.69,
654
+ "learning_rate": 0.0005,
655
+ "loss": 0.0525,
656
+ "step": 168
657
+ },
658
+ {
659
+ "epoch": 11.83,
660
+ "learning_rate": 0.0005,
661
+ "loss": 0.0555,
662
+ "step": 170
663
+ },
664
+ {
665
+ "epoch": 11.97,
666
+ "learning_rate": 0.0005,
667
+ "loss": 0.0557,
668
+ "step": 172
669
+ },
670
+ {
671
+ "epoch": 11.97,
672
+ "eval_gen_len": 270.9674556213018,
673
+ "eval_loss": 3.3825831413269043,
674
+ "eval_rouge1": 42.3089,
675
+ "eval_rouge2": 18.2735,
676
+ "eval_rougeL": 29.0356,
677
+ "eval_rougeLsum": 40.4154,
678
+ "eval_runtime": 1704.6973,
679
+ "eval_samples_per_second": 0.198,
680
+ "eval_steps_per_second": 0.025,
681
+ "step": 172
682
+ },
683
+ {
684
+ "epoch": 12.1,
685
+ "learning_rate": 0.0005,
686
+ "loss": 0.0513,
687
+ "step": 174
688
+ },
689
+ {
690
+ "epoch": 12.24,
691
+ "learning_rate": 0.0005,
692
+ "loss": 0.0419,
693
+ "step": 176
694
+ },
695
+ {
696
+ "epoch": 12.38,
697
+ "learning_rate": 0.0005,
698
+ "loss": 0.0525,
699
+ "step": 178
700
+ },
701
+ {
702
+ "epoch": 12.52,
703
+ "learning_rate": 0.0005,
704
+ "loss": 0.0535,
705
+ "step": 180
706
+ },
707
+ {
708
+ "epoch": 12.66,
709
+ "learning_rate": 0.0005,
710
+ "loss": 0.0611,
711
+ "step": 182
712
+ },
713
+ {
714
+ "epoch": 12.8,
715
+ "learning_rate": 0.0005,
716
+ "loss": 0.0444,
717
+ "step": 184
718
+ },
719
+ {
720
+ "epoch": 12.94,
721
+ "learning_rate": 0.0005,
722
+ "loss": 0.0542,
723
+ "step": 186
724
+ },
725
+ {
726
+ "epoch": 12.94,
727
+ "eval_gen_len": 186.73076923076923,
728
+ "eval_loss": 3.4408490657806396,
729
+ "eval_rouge1": 40.7691,
730
+ "eval_rouge2": 16.529,
731
+ "eval_rougeL": 28.3999,
732
+ "eval_rougeLsum": 38.9723,
733
+ "eval_runtime": 1525.6668,
734
+ "eval_samples_per_second": 0.222,
735
+ "eval_steps_per_second": 0.028,
736
+ "step": 186
737
+ },
738
+ {
739
+ "epoch": 13.08,
740
+ "learning_rate": 0.0005,
741
+ "loss": 0.0602,
742
+ "step": 188
743
+ },
744
+ {
745
+ "epoch": 13.22,
746
+ "learning_rate": 0.0005,
747
+ "loss": 0.0438,
748
+ "step": 190
749
+ },
750
+ {
751
+ "epoch": 13.36,
752
+ "learning_rate": 0.0005,
753
+ "loss": 0.0503,
754
+ "step": 192
755
+ },
756
+ {
757
+ "epoch": 13.5,
758
+ "learning_rate": 0.0005,
759
+ "loss": 0.046,
760
+ "step": 194
761
+ },
762
+ {
763
+ "epoch": 13.63,
764
+ "learning_rate": 0.0005,
765
+ "loss": 0.0368,
766
+ "step": 196
767
+ },
768
+ {
769
+ "epoch": 13.77,
770
+ "learning_rate": 0.0005,
771
+ "loss": 0.0572,
772
+ "step": 198
773
+ },
774
+ {
775
+ "epoch": 13.91,
776
+ "learning_rate": 0.0005,
777
+ "loss": 0.0596,
778
+ "step": 200
779
+ },
780
+ {
781
+ "epoch": 13.98,
782
+ "eval_gen_len": 398.4704142011834,
783
+ "eval_loss": 3.525272846221924,
784
+ "eval_rouge1": 37.0037,
785
+ "eval_rouge2": 15.9098,
786
+ "eval_rougeL": 25.2808,
787
+ "eval_rougeLsum": 35.3868,
788
+ "eval_runtime": 1778.3289,
789
+ "eval_samples_per_second": 0.19,
790
+ "eval_steps_per_second": 0.024,
791
+ "step": 201
792
+ },
793
+ {
794
+ "epoch": 14.05,
795
+ "learning_rate": 0.0005,
796
+ "loss": 0.0434,
797
+ "step": 202
798
+ },
799
+ {
800
+ "epoch": 14.19,
801
+ "learning_rate": 0.0005,
802
+ "loss": 0.0453,
803
+ "step": 204
804
+ },
805
+ {
806
+ "epoch": 14.33,
807
+ "learning_rate": 0.0005,
808
+ "loss": 0.0453,
809
+ "step": 206
810
+ },
811
+ {
812
+ "epoch": 14.47,
813
+ "learning_rate": 0.0005,
814
+ "loss": 0.0586,
815
+ "step": 208
816
+ },
817
+ {
818
+ "epoch": 14.61,
819
+ "learning_rate": 0.0005,
820
+ "loss": 0.0385,
821
+ "step": 210
822
+ },
823
+ {
824
+ "epoch": 14.61,
825
+ "eval_gen_len": 499.31065088757396,
826
+ "eval_loss": 3.498972177505493,
827
+ "eval_rouge1": 32.5815,
828
+ "eval_rouge2": 14.2951,
829
+ "eval_rougeL": 22.4501,
830
+ "eval_rougeLsum": 31.2928,
831
+ "eval_runtime": 1779.9602,
832
+ "eval_samples_per_second": 0.19,
833
+ "eval_steps_per_second": 0.024,
834
+ "step": 210
835
  },
836
  {
837
+ "epoch": 14.61,
838
+ "step": 210,
839
+ "total_flos": 3.6715371967648973e+18,
840
+ "train_loss": 0.13047422234501158,
841
+ "train_runtime": 78176.4332,
842
+ "train_samples_per_second": 0.705,
843
+ "train_steps_per_second": 0.003
 
 
 
 
 
 
 
 
 
 
 
 
 
844
  }
845
  ],
846
  "logging_steps": 2,
847
+ "max_steps": 210,
848
  "num_train_epochs": 15,
849
  "save_steps": 500,
850
+ "total_flos": 3.6715371967648973e+18,
851
  "trial_name": null,
852
  "trial_params": null
853
  }