farleyknight committed on
Commit
4b000ae
1 Parent(s): c142e23

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +18 -0
  2. eval_results.json +13 -0
  3. train_results.json +8 -0
  4. trainer_state.json +572 -0
all_results.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_gen_len": 122.00605966438782,
4
+ "eval_loss": 2.299438714981079,
5
+ "eval_rouge1": 37.7982,
6
+ "eval_rouge2": 12.5571,
7
+ "eval_rougeL": 24.9325,
8
+ "eval_rougeLsum": 32.9189,
9
+ "eval_runtime": 1727.733,
10
+ "eval_samples": 6436,
11
+ "eval_samples_per_second": 3.725,
12
+ "eval_steps_per_second": 0.466,
13
+ "train_loss": 2.480459571265385,
14
+ "train_runtime": 37543.9323,
15
+ "train_samples": 202914,
16
+ "train_samples_per_second": 16.214,
17
+ "train_steps_per_second": 2.027
18
+ }
eval_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_gen_len": 122.00605966438782,
4
+ "eval_loss": 2.299438714981079,
5
+ "eval_rouge1": 37.7982,
6
+ "eval_rouge2": 12.5571,
7
+ "eval_rougeL": 24.9325,
8
+ "eval_rougeLsum": 32.9189,
9
+ "eval_runtime": 1727.733,
10
+ "eval_samples": 6436,
11
+ "eval_samples_per_second": 3.725,
12
+ "eval_steps_per_second": 0.466
13
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 2.480459571265385,
4
+ "train_runtime": 37543.9323,
5
+ "train_samples": 202914,
6
+ "train_samples_per_second": 16.214,
7
+ "train_steps_per_second": 2.027
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "global_step": 76095,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 4.9342926604901765e-05,
13
+ "loss": 2.907,
14
+ "step": 1000
15
+ },
16
+ {
17
+ "epoch": 0.08,
18
+ "learning_rate": 4.868585320980354e-05,
19
+ "loss": 2.7863,
20
+ "step": 2000
21
+ },
22
+ {
23
+ "epoch": 0.12,
24
+ "learning_rate": 4.8028779814705304e-05,
25
+ "loss": 2.7315,
26
+ "step": 3000
27
+ },
28
+ {
29
+ "epoch": 0.16,
30
+ "learning_rate": 4.737170641960707e-05,
31
+ "loss": 2.7056,
32
+ "step": 4000
33
+ },
34
+ {
35
+ "epoch": 0.2,
36
+ "learning_rate": 4.671463302450884e-05,
37
+ "loss": 2.6726,
38
+ "step": 5000
39
+ },
40
+ {
41
+ "epoch": 0.24,
42
+ "learning_rate": 4.6057559629410605e-05,
43
+ "loss": 2.6651,
44
+ "step": 6000
45
+ },
46
+ {
47
+ "epoch": 0.28,
48
+ "learning_rate": 4.5400486234312375e-05,
49
+ "loss": 2.625,
50
+ "step": 7000
51
+ },
52
+ {
53
+ "epoch": 0.32,
54
+ "learning_rate": 4.4743412839214144e-05,
55
+ "loss": 2.6331,
56
+ "step": 8000
57
+ },
58
+ {
59
+ "epoch": 0.35,
60
+ "learning_rate": 4.408633944411591e-05,
61
+ "loss": 2.6084,
62
+ "step": 9000
63
+ },
64
+ {
65
+ "epoch": 0.39,
66
+ "learning_rate": 4.3429266049017676e-05,
67
+ "loss": 2.5925,
68
+ "step": 10000
69
+ },
70
+ {
71
+ "epoch": 0.39,
72
+ "eval_gen_len": 19.0,
73
+ "eval_loss": 2.45664119720459,
74
+ "eval_rouge1": 17.8432,
75
+ "eval_rouge2": 6.6779,
76
+ "eval_rougeL": 14.2303,
77
+ "eval_rougeLsum": 16.1952,
78
+ "eval_runtime": 446.6007,
79
+ "eval_samples_per_second": 14.411,
80
+ "eval_steps_per_second": 1.803,
81
+ "step": 10000
82
+ },
83
+ {
84
+ "epoch": 0.43,
85
+ "learning_rate": 4.2772192653919446e-05,
86
+ "loss": 2.5905,
87
+ "step": 11000
88
+ },
89
+ {
90
+ "epoch": 0.47,
91
+ "learning_rate": 4.2115119258821215e-05,
92
+ "loss": 2.5838,
93
+ "step": 12000
94
+ },
95
+ {
96
+ "epoch": 0.51,
97
+ "learning_rate": 4.145804586372298e-05,
98
+ "loss": 2.565,
99
+ "step": 13000
100
+ },
101
+ {
102
+ "epoch": 0.55,
103
+ "learning_rate": 4.080097246862475e-05,
104
+ "loss": 2.5687,
105
+ "step": 14000
106
+ },
107
+ {
108
+ "epoch": 0.59,
109
+ "learning_rate": 4.0143899073526516e-05,
110
+ "loss": 2.5542,
111
+ "step": 15000
112
+ },
113
+ {
114
+ "epoch": 0.63,
115
+ "learning_rate": 3.948682567842828e-05,
116
+ "loss": 2.5402,
117
+ "step": 16000
118
+ },
119
+ {
120
+ "epoch": 0.67,
121
+ "learning_rate": 3.882975228333005e-05,
122
+ "loss": 2.5368,
123
+ "step": 17000
124
+ },
125
+ {
126
+ "epoch": 0.71,
127
+ "learning_rate": 3.817267888823182e-05,
128
+ "loss": 2.5261,
129
+ "step": 18000
130
+ },
131
+ {
132
+ "epoch": 0.75,
133
+ "learning_rate": 3.751560549313358e-05,
134
+ "loss": 2.5257,
135
+ "step": 19000
136
+ },
137
+ {
138
+ "epoch": 0.79,
139
+ "learning_rate": 3.6858532098035357e-05,
140
+ "loss": 2.518,
141
+ "step": 20000
142
+ },
143
+ {
144
+ "epoch": 0.79,
145
+ "eval_gen_len": 19.0,
146
+ "eval_loss": 2.3868377208709717,
147
+ "eval_rouge1": 18.0354,
148
+ "eval_rouge2": 6.8565,
149
+ "eval_rougeL": 14.3552,
150
+ "eval_rougeLsum": 16.3664,
151
+ "eval_runtime": 450.4559,
152
+ "eval_samples_per_second": 14.288,
153
+ "eval_steps_per_second": 1.787,
154
+ "step": 20000
155
+ },
156
+ {
157
+ "epoch": 0.83,
158
+ "learning_rate": 3.620145870293712e-05,
159
+ "loss": 2.5159,
160
+ "step": 21000
161
+ },
162
+ {
163
+ "epoch": 0.87,
164
+ "learning_rate": 3.554438530783889e-05,
165
+ "loss": 2.4994,
166
+ "step": 22000
167
+ },
168
+ {
169
+ "epoch": 0.91,
170
+ "learning_rate": 3.488731191274066e-05,
171
+ "loss": 2.4993,
172
+ "step": 23000
173
+ },
174
+ {
175
+ "epoch": 0.95,
176
+ "learning_rate": 3.423023851764242e-05,
177
+ "loss": 2.5103,
178
+ "step": 24000
179
+ },
180
+ {
181
+ "epoch": 0.99,
182
+ "learning_rate": 3.357316512254419e-05,
183
+ "loss": 2.4925,
184
+ "step": 25000
185
+ },
186
+ {
187
+ "epoch": 1.03,
188
+ "learning_rate": 3.291609172744596e-05,
189
+ "loss": 2.4812,
190
+ "step": 26000
191
+ },
192
+ {
193
+ "epoch": 1.06,
194
+ "learning_rate": 3.225901833234772e-05,
195
+ "loss": 2.4806,
196
+ "step": 27000
197
+ },
198
+ {
199
+ "epoch": 1.1,
200
+ "learning_rate": 3.160194493724949e-05,
201
+ "loss": 2.4638,
202
+ "step": 28000
203
+ },
204
+ {
205
+ "epoch": 1.14,
206
+ "learning_rate": 3.094487154215126e-05,
207
+ "loss": 2.4676,
208
+ "step": 29000
209
+ },
210
+ {
211
+ "epoch": 1.18,
212
+ "learning_rate": 3.0287798147053027e-05,
213
+ "loss": 2.4587,
214
+ "step": 30000
215
+ },
216
+ {
217
+ "epoch": 1.18,
218
+ "eval_gen_len": 19.0,
219
+ "eval_loss": 2.3600151538848877,
220
+ "eval_rouge1": 18.2076,
221
+ "eval_rouge2": 6.9618,
222
+ "eval_rougeL": 14.5349,
223
+ "eval_rougeLsum": 16.5626,
224
+ "eval_runtime": 450.2674,
225
+ "eval_samples_per_second": 14.294,
226
+ "eval_steps_per_second": 1.788,
227
+ "step": 30000
228
+ },
229
+ {
230
+ "epoch": 1.22,
231
+ "learning_rate": 2.9630724751954796e-05,
232
+ "loss": 2.453,
233
+ "step": 31000
234
+ },
235
+ {
236
+ "epoch": 1.26,
237
+ "learning_rate": 2.8973651356856562e-05,
238
+ "loss": 2.452,
239
+ "step": 32000
240
+ },
241
+ {
242
+ "epoch": 1.3,
243
+ "learning_rate": 2.831657796175833e-05,
244
+ "loss": 2.4553,
245
+ "step": 33000
246
+ },
247
+ {
248
+ "epoch": 1.34,
249
+ "learning_rate": 2.7659504566660098e-05,
250
+ "loss": 2.4527,
251
+ "step": 34000
252
+ },
253
+ {
254
+ "epoch": 1.38,
255
+ "learning_rate": 2.7002431171561864e-05,
256
+ "loss": 2.4478,
257
+ "step": 35000
258
+ },
259
+ {
260
+ "epoch": 1.42,
261
+ "learning_rate": 2.634535777646363e-05,
262
+ "loss": 2.4563,
263
+ "step": 36000
264
+ },
265
+ {
266
+ "epoch": 1.46,
267
+ "learning_rate": 2.5688284381365403e-05,
268
+ "loss": 2.4402,
269
+ "step": 37000
270
+ },
271
+ {
272
+ "epoch": 1.5,
273
+ "learning_rate": 2.503121098626717e-05,
274
+ "loss": 2.447,
275
+ "step": 38000
276
+ },
277
+ {
278
+ "epoch": 1.54,
279
+ "learning_rate": 2.4374137591168935e-05,
280
+ "loss": 2.4436,
281
+ "step": 39000
282
+ },
283
+ {
284
+ "epoch": 1.58,
285
+ "learning_rate": 2.37170641960707e-05,
286
+ "loss": 2.4365,
287
+ "step": 40000
288
+ },
289
+ {
290
+ "epoch": 1.58,
291
+ "eval_gen_len": 19.0,
292
+ "eval_loss": 2.329491138458252,
293
+ "eval_rouge1": 18.3579,
294
+ "eval_rouge2": 7.0312,
295
+ "eval_rougeL": 14.6145,
296
+ "eval_rougeLsum": 16.6845,
297
+ "eval_runtime": 446.3631,
298
+ "eval_samples_per_second": 14.419,
299
+ "eval_steps_per_second": 1.803,
300
+ "step": 40000
301
+ },
302
+ {
303
+ "epoch": 1.62,
304
+ "learning_rate": 2.305999080097247e-05,
305
+ "loss": 2.4393,
306
+ "step": 41000
307
+ },
308
+ {
309
+ "epoch": 1.66,
310
+ "learning_rate": 2.240291740587424e-05,
311
+ "loss": 2.4266,
312
+ "step": 42000
313
+ },
314
+ {
315
+ "epoch": 1.7,
316
+ "learning_rate": 2.1745844010776002e-05,
317
+ "loss": 2.4337,
318
+ "step": 43000
319
+ },
320
+ {
321
+ "epoch": 1.73,
322
+ "learning_rate": 2.108877061567777e-05,
323
+ "loss": 2.4238,
324
+ "step": 44000
325
+ },
326
+ {
327
+ "epoch": 1.77,
328
+ "learning_rate": 2.043169722057954e-05,
329
+ "loss": 2.4304,
330
+ "step": 45000
331
+ },
332
+ {
333
+ "epoch": 1.81,
334
+ "learning_rate": 1.9774623825481307e-05,
335
+ "loss": 2.4267,
336
+ "step": 46000
337
+ },
338
+ {
339
+ "epoch": 1.85,
340
+ "learning_rate": 1.9117550430383073e-05,
341
+ "loss": 2.4263,
342
+ "step": 47000
343
+ },
344
+ {
345
+ "epoch": 1.89,
346
+ "learning_rate": 1.8460477035284842e-05,
347
+ "loss": 2.4328,
348
+ "step": 48000
349
+ },
350
+ {
351
+ "epoch": 1.93,
352
+ "learning_rate": 1.7803403640186608e-05,
353
+ "loss": 2.4215,
354
+ "step": 49000
355
+ },
356
+ {
357
+ "epoch": 1.97,
358
+ "learning_rate": 1.7146330245088378e-05,
359
+ "loss": 2.4306,
360
+ "step": 50000
361
+ },
362
+ {
363
+ "epoch": 1.97,
364
+ "eval_gen_len": 19.0,
365
+ "eval_loss": 2.3189666271209717,
366
+ "eval_rouge1": 18.4551,
367
+ "eval_rouge2": 7.0861,
368
+ "eval_rougeL": 14.6879,
369
+ "eval_rougeLsum": 16.7627,
370
+ "eval_runtime": 447.1376,
371
+ "eval_samples_per_second": 14.394,
372
+ "eval_steps_per_second": 1.8,
373
+ "step": 50000
374
+ },
375
+ {
376
+ "epoch": 2.01,
377
+ "learning_rate": 1.6489256849990144e-05,
378
+ "loss": 2.4224,
379
+ "step": 51000
380
+ },
381
+ {
382
+ "epoch": 2.05,
383
+ "learning_rate": 1.5832183454891913e-05,
384
+ "loss": 2.414,
385
+ "step": 52000
386
+ },
387
+ {
388
+ "epoch": 2.09,
389
+ "learning_rate": 1.5175110059793679e-05,
390
+ "loss": 2.4029,
391
+ "step": 53000
392
+ },
393
+ {
394
+ "epoch": 2.13,
395
+ "learning_rate": 1.4518036664695447e-05,
396
+ "loss": 2.4104,
397
+ "step": 54000
398
+ },
399
+ {
400
+ "epoch": 2.17,
401
+ "learning_rate": 1.3860963269597216e-05,
402
+ "loss": 2.4117,
403
+ "step": 55000
404
+ },
405
+ {
406
+ "epoch": 2.21,
407
+ "learning_rate": 1.320388987449898e-05,
408
+ "loss": 2.4096,
409
+ "step": 56000
410
+ },
411
+ {
412
+ "epoch": 2.25,
413
+ "learning_rate": 1.254681647940075e-05,
414
+ "loss": 2.4072,
415
+ "step": 57000
416
+ },
417
+ {
418
+ "epoch": 2.29,
419
+ "learning_rate": 1.1889743084302516e-05,
420
+ "loss": 2.4062,
421
+ "step": 58000
422
+ },
423
+ {
424
+ "epoch": 2.33,
425
+ "learning_rate": 1.1232669689204285e-05,
426
+ "loss": 2.4014,
427
+ "step": 59000
428
+ },
429
+ {
430
+ "epoch": 2.37,
431
+ "learning_rate": 1.0575596294106051e-05,
432
+ "loss": 2.4005,
433
+ "step": 60000
434
+ },
435
+ {
436
+ "epoch": 2.37,
437
+ "eval_gen_len": 19.0,
438
+ "eval_loss": 2.3056259155273438,
439
+ "eval_rouge1": 18.3521,
440
+ "eval_rouge2": 7.0496,
441
+ "eval_rougeL": 14.6413,
442
+ "eval_rougeLsum": 16.6832,
443
+ "eval_runtime": 444.8978,
444
+ "eval_samples_per_second": 14.466,
445
+ "eval_steps_per_second": 1.809,
446
+ "step": 60000
447
+ },
448
+ {
449
+ "epoch": 2.4,
450
+ "learning_rate": 9.918522899007819e-06,
451
+ "loss": 2.4054,
452
+ "step": 61000
453
+ },
454
+ {
455
+ "epoch": 2.44,
456
+ "learning_rate": 9.261449503909587e-06,
457
+ "loss": 2.4074,
458
+ "step": 62000
459
+ },
460
+ {
461
+ "epoch": 2.48,
462
+ "learning_rate": 8.604376108811355e-06,
463
+ "loss": 2.3965,
464
+ "step": 63000
465
+ },
466
+ {
467
+ "epoch": 2.52,
468
+ "learning_rate": 7.947302713713122e-06,
469
+ "loss": 2.3964,
470
+ "step": 64000
471
+ },
472
+ {
473
+ "epoch": 2.56,
474
+ "learning_rate": 7.29022931861489e-06,
475
+ "loss": 2.3965,
476
+ "step": 65000
477
+ },
478
+ {
479
+ "epoch": 2.6,
480
+ "learning_rate": 6.633155923516657e-06,
481
+ "loss": 2.3934,
482
+ "step": 66000
483
+ },
484
+ {
485
+ "epoch": 2.64,
486
+ "learning_rate": 5.9760825284184245e-06,
487
+ "loss": 2.4052,
488
+ "step": 67000
489
+ },
490
+ {
491
+ "epoch": 2.68,
492
+ "learning_rate": 5.319009133320192e-06,
493
+ "loss": 2.397,
494
+ "step": 68000
495
+ },
496
+ {
497
+ "epoch": 2.72,
498
+ "learning_rate": 4.66193573822196e-06,
499
+ "loss": 2.3875,
500
+ "step": 69000
501
+ },
502
+ {
503
+ "epoch": 2.76,
504
+ "learning_rate": 4.004862343123727e-06,
505
+ "loss": 2.396,
506
+ "step": 70000
507
+ },
508
+ {
509
+ "epoch": 2.76,
510
+ "eval_gen_len": 19.0,
511
+ "eval_loss": 2.3012354373931885,
512
+ "eval_rouge1": 18.348,
513
+ "eval_rouge2": 7.0439,
514
+ "eval_rougeL": 14.6509,
515
+ "eval_rougeLsum": 16.6994,
516
+ "eval_runtime": 450.1278,
517
+ "eval_samples_per_second": 14.298,
518
+ "eval_steps_per_second": 1.788,
519
+ "step": 70000
520
+ },
521
+ {
522
+ "epoch": 2.8,
523
+ "learning_rate": 3.3477889480254945e-06,
524
+ "loss": 2.3949,
525
+ "step": 71000
526
+ },
527
+ {
528
+ "epoch": 2.84,
529
+ "learning_rate": 2.6907155529272622e-06,
530
+ "loss": 2.4087,
531
+ "step": 72000
532
+ },
533
+ {
534
+ "epoch": 2.88,
535
+ "learning_rate": 2.0336421578290295e-06,
536
+ "loss": 2.3957,
537
+ "step": 73000
538
+ },
539
+ {
540
+ "epoch": 2.92,
541
+ "learning_rate": 1.376568762730797e-06,
542
+ "loss": 2.3948,
543
+ "step": 74000
544
+ },
545
+ {
546
+ "epoch": 2.96,
547
+ "learning_rate": 7.194953676325646e-07,
548
+ "loss": 2.3925,
549
+ "step": 75000
550
+ },
551
+ {
552
+ "epoch": 3.0,
553
+ "learning_rate": 6.242197253433209e-08,
554
+ "loss": 2.3883,
555
+ "step": 76000
556
+ },
557
+ {
558
+ "epoch": 3.0,
559
+ "step": 76095,
560
+ "total_flos": 1.6477647782333645e+17,
561
+ "train_loss": 2.480459571265385,
562
+ "train_runtime": 37543.9323,
563
+ "train_samples_per_second": 16.214,
564
+ "train_steps_per_second": 2.027
565
+ }
566
+ ],
567
+ "max_steps": 76095,
568
+ "num_train_epochs": 3,
569
+ "total_flos": 1.6477647782333645e+17,
570
+ "trial_name": null,
571
+ "trial_params": null
572
+ }