phosseini commited on
Commit
1d57b6f
1 Parent(s): 44f308d

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +1118 -0
trainer_state.json ADDED
@@ -0,0 +1,1118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7937971949577332,
3
+ "best_model_checkpoint": "models/checkpoints/checkpoint-14500",
4
+ "epoch": 7.611468311245243,
5
+ "global_step": 14500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.05,
12
+ "learning_rate": 1.9895013123359582e-05,
13
+ "loss": 1.3844,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.1,
18
+ "learning_rate": 1.9790026246719162e-05,
19
+ "loss": 1.1178,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.16,
24
+ "learning_rate": 1.9685039370078743e-05,
25
+ "loss": 1.0595,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.21,
30
+ "learning_rate": 1.9580052493438323e-05,
31
+ "loss": 1.0309,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.26,
36
+ "learning_rate": 1.94750656167979e-05,
37
+ "loss": 1.0043,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.26,
42
+ "eval_loss": 0.9238424897193909,
43
+ "eval_runtime": 78.6171,
44
+ "eval_samples_per_second": 179.77,
45
+ "eval_steps_per_second": 11.244,
46
+ "step": 500
47
+ },
48
+ {
49
+ "epoch": 0.31,
50
+ "learning_rate": 1.937007874015748e-05,
51
+ "loss": 1.0168,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.37,
56
+ "learning_rate": 1.9265091863517064e-05,
57
+ "loss": 0.959,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.42,
62
+ "learning_rate": 1.916010498687664e-05,
63
+ "loss": 0.9599,
64
+ "step": 800
65
+ },
66
+ {
67
+ "epoch": 0.47,
68
+ "learning_rate": 1.905511811023622e-05,
69
+ "loss": 0.947,
70
+ "step": 900
71
+ },
72
+ {
73
+ "epoch": 0.52,
74
+ "learning_rate": 1.89501312335958e-05,
75
+ "loss": 0.9886,
76
+ "step": 1000
77
+ },
78
+ {
79
+ "epoch": 0.52,
80
+ "eval_loss": 0.8911218047142029,
81
+ "eval_runtime": 58.2514,
82
+ "eval_samples_per_second": 242.621,
83
+ "eval_steps_per_second": 15.176,
84
+ "step": 1000
85
+ },
86
+ {
87
+ "epoch": 0.58,
88
+ "learning_rate": 1.8845144356955382e-05,
89
+ "loss": 0.97,
90
+ "step": 1100
91
+ },
92
+ {
93
+ "epoch": 0.63,
94
+ "learning_rate": 1.8740157480314962e-05,
95
+ "loss": 0.9091,
96
+ "step": 1200
97
+ },
98
+ {
99
+ "epoch": 0.68,
100
+ "learning_rate": 1.8635170603674542e-05,
101
+ "loss": 0.9672,
102
+ "step": 1300
103
+ },
104
+ {
105
+ "epoch": 0.73,
106
+ "learning_rate": 1.8530183727034123e-05,
107
+ "loss": 0.9028,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.79,
112
+ "learning_rate": 1.8425196850393703e-05,
113
+ "loss": 0.8951,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.79,
118
+ "eval_loss": 0.85249924659729,
119
+ "eval_runtime": 58.049,
120
+ "eval_samples_per_second": 243.467,
121
+ "eval_steps_per_second": 15.229,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 0.84,
126
+ "learning_rate": 1.8320209973753283e-05,
127
+ "loss": 0.8988,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 0.89,
132
+ "learning_rate": 1.821522309711286e-05,
133
+ "loss": 0.9257,
134
+ "step": 1700
135
+ },
136
+ {
137
+ "epoch": 0.94,
138
+ "learning_rate": 1.811023622047244e-05,
139
+ "loss": 0.9391,
140
+ "step": 1800
141
+ },
142
+ {
143
+ "epoch": 1.0,
144
+ "learning_rate": 1.8005249343832024e-05,
145
+ "loss": 0.9006,
146
+ "step": 1900
147
+ },
148
+ {
149
+ "epoch": 1.05,
150
+ "learning_rate": 1.79002624671916e-05,
151
+ "loss": 0.8434,
152
+ "step": 2000
153
+ },
154
+ {
155
+ "epoch": 1.05,
156
+ "eval_loss": 0.855362594127655,
157
+ "eval_runtime": 59.1417,
158
+ "eval_samples_per_second": 238.968,
159
+ "eval_steps_per_second": 14.947,
160
+ "step": 2000
161
+ },
162
+ {
163
+ "epoch": 1.1,
164
+ "learning_rate": 1.779527559055118e-05,
165
+ "loss": 0.897,
166
+ "step": 2100
167
+ },
168
+ {
169
+ "epoch": 1.15,
170
+ "learning_rate": 1.7690288713910762e-05,
171
+ "loss": 0.9001,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 1.21,
176
+ "learning_rate": 1.7585301837270342e-05,
177
+ "loss": 0.8561,
178
+ "step": 2300
179
+ },
180
+ {
181
+ "epoch": 1.26,
182
+ "learning_rate": 1.7480314960629923e-05,
183
+ "loss": 0.8814,
184
+ "step": 2400
185
+ },
186
+ {
187
+ "epoch": 1.31,
188
+ "learning_rate": 1.7375328083989503e-05,
189
+ "loss": 0.84,
190
+ "step": 2500
191
+ },
192
+ {
193
+ "epoch": 1.31,
194
+ "eval_loss": 0.8605416417121887,
195
+ "eval_runtime": 58.7215,
196
+ "eval_samples_per_second": 240.679,
197
+ "eval_steps_per_second": 15.054,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 1.36,
202
+ "learning_rate": 1.7270341207349083e-05,
203
+ "loss": 0.8514,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 1.42,
208
+ "learning_rate": 1.7165354330708663e-05,
209
+ "loss": 0.8332,
210
+ "step": 2700
211
+ },
212
+ {
213
+ "epoch": 1.47,
214
+ "learning_rate": 1.7060367454068244e-05,
215
+ "loss": 0.8546,
216
+ "step": 2800
217
+ },
218
+ {
219
+ "epoch": 1.52,
220
+ "learning_rate": 1.695538057742782e-05,
221
+ "loss": 0.8452,
222
+ "step": 2900
223
+ },
224
+ {
225
+ "epoch": 1.57,
226
+ "learning_rate": 1.68503937007874e-05,
227
+ "loss": 0.8434,
228
+ "step": 3000
229
+ },
230
+ {
231
+ "epoch": 1.57,
232
+ "eval_loss": 0.8366661667823792,
233
+ "eval_runtime": 59.2216,
234
+ "eval_samples_per_second": 238.646,
235
+ "eval_steps_per_second": 14.927,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 1.63,
240
+ "learning_rate": 1.6745406824146985e-05,
241
+ "loss": 0.8221,
242
+ "step": 3100
243
+ },
244
+ {
245
+ "epoch": 1.68,
246
+ "learning_rate": 1.6640419947506562e-05,
247
+ "loss": 0.8596,
248
+ "step": 3200
249
+ },
250
+ {
251
+ "epoch": 1.73,
252
+ "learning_rate": 1.6535433070866142e-05,
253
+ "loss": 0.8533,
254
+ "step": 3300
255
+ },
256
+ {
257
+ "epoch": 1.78,
258
+ "learning_rate": 1.6430446194225722e-05,
259
+ "loss": 0.833,
260
+ "step": 3400
261
+ },
262
+ {
263
+ "epoch": 1.84,
264
+ "learning_rate": 1.6325459317585303e-05,
265
+ "loss": 0.8043,
266
+ "step": 3500
267
+ },
268
+ {
269
+ "epoch": 1.84,
270
+ "eval_loss": 0.8364191651344299,
271
+ "eval_runtime": 59.4819,
272
+ "eval_samples_per_second": 237.602,
273
+ "eval_steps_per_second": 14.862,
274
+ "step": 3500
275
+ },
276
+ {
277
+ "epoch": 1.89,
278
+ "learning_rate": 1.6220472440944883e-05,
279
+ "loss": 0.7932,
280
+ "step": 3600
281
+ },
282
+ {
283
+ "epoch": 1.94,
284
+ "learning_rate": 1.6115485564304463e-05,
285
+ "loss": 0.8283,
286
+ "step": 3700
287
+ },
288
+ {
289
+ "epoch": 1.99,
290
+ "learning_rate": 1.6010498687664044e-05,
291
+ "loss": 0.8467,
292
+ "step": 3800
293
+ },
294
+ {
295
+ "epoch": 2.05,
296
+ "learning_rate": 1.5905511811023624e-05,
297
+ "loss": 0.788,
298
+ "step": 3900
299
+ },
300
+ {
301
+ "epoch": 2.1,
302
+ "learning_rate": 1.5800524934383204e-05,
303
+ "loss": 0.761,
304
+ "step": 4000
305
+ },
306
+ {
307
+ "epoch": 2.1,
308
+ "eval_loss": 0.8275307416915894,
309
+ "eval_runtime": 58.9069,
310
+ "eval_samples_per_second": 239.921,
311
+ "eval_steps_per_second": 15.007,
312
+ "step": 4000
313
+ },
314
+ {
315
+ "epoch": 2.15,
316
+ "learning_rate": 1.5695538057742785e-05,
317
+ "loss": 0.7742,
318
+ "step": 4100
319
+ },
320
+ {
321
+ "epoch": 2.2,
322
+ "learning_rate": 1.559055118110236e-05,
323
+ "loss": 0.787,
324
+ "step": 4200
325
+ },
326
+ {
327
+ "epoch": 2.26,
328
+ "learning_rate": 1.5485564304461945e-05,
329
+ "loss": 0.7856,
330
+ "step": 4300
331
+ },
332
+ {
333
+ "epoch": 2.31,
334
+ "learning_rate": 1.5380577427821522e-05,
335
+ "loss": 0.8009,
336
+ "step": 4400
337
+ },
338
+ {
339
+ "epoch": 2.36,
340
+ "learning_rate": 1.5275590551181102e-05,
341
+ "loss": 0.7899,
342
+ "step": 4500
343
+ },
344
+ {
345
+ "epoch": 2.36,
346
+ "eval_loss": 0.854367733001709,
347
+ "eval_runtime": 58.9597,
348
+ "eval_samples_per_second": 239.706,
349
+ "eval_steps_per_second": 14.993,
350
+ "step": 4500
351
+ },
352
+ {
353
+ "epoch": 2.41,
354
+ "learning_rate": 1.5170603674540683e-05,
355
+ "loss": 0.8067,
356
+ "step": 4600
357
+ },
358
+ {
359
+ "epoch": 2.47,
360
+ "learning_rate": 1.5065616797900265e-05,
361
+ "loss": 0.7733,
362
+ "step": 4700
363
+ },
364
+ {
365
+ "epoch": 2.52,
366
+ "learning_rate": 1.4960629921259843e-05,
367
+ "loss": 0.7509,
368
+ "step": 4800
369
+ },
370
+ {
371
+ "epoch": 2.57,
372
+ "learning_rate": 1.4855643044619424e-05,
373
+ "loss": 0.7775,
374
+ "step": 4900
375
+ },
376
+ {
377
+ "epoch": 2.62,
378
+ "learning_rate": 1.4750656167979002e-05,
379
+ "loss": 0.7918,
380
+ "step": 5000
381
+ },
382
+ {
383
+ "epoch": 2.62,
384
+ "eval_loss": 0.8410710692405701,
385
+ "eval_runtime": 59.1309,
386
+ "eval_samples_per_second": 239.012,
387
+ "eval_steps_per_second": 14.95,
388
+ "step": 5000
389
+ },
390
+ {
391
+ "epoch": 2.68,
392
+ "learning_rate": 1.4645669291338584e-05,
393
+ "loss": 0.773,
394
+ "step": 5100
395
+ },
396
+ {
397
+ "epoch": 2.73,
398
+ "learning_rate": 1.4540682414698165e-05,
399
+ "loss": 0.7716,
400
+ "step": 5200
401
+ },
402
+ {
403
+ "epoch": 2.78,
404
+ "learning_rate": 1.4435695538057743e-05,
405
+ "loss": 0.7745,
406
+ "step": 5300
407
+ },
408
+ {
409
+ "epoch": 2.83,
410
+ "learning_rate": 1.4330708661417324e-05,
411
+ "loss": 0.7727,
412
+ "step": 5400
413
+ },
414
+ {
415
+ "epoch": 2.89,
416
+ "learning_rate": 1.4225721784776904e-05,
417
+ "loss": 0.7582,
418
+ "step": 5500
419
+ },
420
+ {
421
+ "epoch": 2.89,
422
+ "eval_loss": 0.8356336355209351,
423
+ "eval_runtime": 59.0698,
424
+ "eval_samples_per_second": 239.259,
425
+ "eval_steps_per_second": 14.965,
426
+ "step": 5500
427
+ },
428
+ {
429
+ "epoch": 2.94,
430
+ "learning_rate": 1.4120734908136484e-05,
431
+ "loss": 0.7688,
432
+ "step": 5600
433
+ },
434
+ {
435
+ "epoch": 2.99,
436
+ "learning_rate": 1.4015748031496063e-05,
437
+ "loss": 0.7512,
438
+ "step": 5700
439
+ },
440
+ {
441
+ "epoch": 3.04,
442
+ "learning_rate": 1.3910761154855643e-05,
443
+ "loss": 0.7398,
444
+ "step": 5800
445
+ },
446
+ {
447
+ "epoch": 3.1,
448
+ "learning_rate": 1.3805774278215225e-05,
449
+ "loss": 0.7089,
450
+ "step": 5900
451
+ },
452
+ {
453
+ "epoch": 3.15,
454
+ "learning_rate": 1.3700787401574804e-05,
455
+ "loss": 0.7462,
456
+ "step": 6000
457
+ },
458
+ {
459
+ "epoch": 3.15,
460
+ "eval_loss": 0.833259105682373,
461
+ "eval_runtime": 58.7017,
462
+ "eval_samples_per_second": 240.76,
463
+ "eval_steps_per_second": 15.059,
464
+ "step": 6000
465
+ },
466
+ {
467
+ "epoch": 3.2,
468
+ "learning_rate": 1.3595800524934384e-05,
469
+ "loss": 0.7512,
470
+ "step": 6100
471
+ },
472
+ {
473
+ "epoch": 3.25,
474
+ "learning_rate": 1.3490813648293963e-05,
475
+ "loss": 0.7273,
476
+ "step": 6200
477
+ },
478
+ {
479
+ "epoch": 3.31,
480
+ "learning_rate": 1.3385826771653545e-05,
481
+ "loss": 0.7368,
482
+ "step": 6300
483
+ },
484
+ {
485
+ "epoch": 3.36,
486
+ "learning_rate": 1.3280839895013125e-05,
487
+ "loss": 0.7253,
488
+ "step": 6400
489
+ },
490
+ {
491
+ "epoch": 3.41,
492
+ "learning_rate": 1.3175853018372704e-05,
493
+ "loss": 0.7325,
494
+ "step": 6500
495
+ },
496
+ {
497
+ "epoch": 3.41,
498
+ "eval_loss": 0.8515605926513672,
499
+ "eval_runtime": 59.4138,
500
+ "eval_samples_per_second": 237.874,
501
+ "eval_steps_per_second": 14.879,
502
+ "step": 6500
503
+ },
504
+ {
505
+ "epoch": 3.46,
506
+ "learning_rate": 1.3070866141732284e-05,
507
+ "loss": 0.7574,
508
+ "step": 6600
509
+ },
510
+ {
511
+ "epoch": 3.52,
512
+ "learning_rate": 1.2965879265091864e-05,
513
+ "loss": 0.7424,
514
+ "step": 6700
515
+ },
516
+ {
517
+ "epoch": 3.57,
518
+ "learning_rate": 1.2860892388451445e-05,
519
+ "loss": 0.7412,
520
+ "step": 6800
521
+ },
522
+ {
523
+ "epoch": 3.62,
524
+ "learning_rate": 1.2755905511811025e-05,
525
+ "loss": 0.7244,
526
+ "step": 6900
527
+ },
528
+ {
529
+ "epoch": 3.67,
530
+ "learning_rate": 1.2650918635170604e-05,
531
+ "loss": 0.7218,
532
+ "step": 7000
533
+ },
534
+ {
535
+ "epoch": 3.67,
536
+ "eval_loss": 0.8130525946617126,
537
+ "eval_runtime": 58.9752,
538
+ "eval_samples_per_second": 239.643,
539
+ "eval_steps_per_second": 14.989,
540
+ "step": 7000
541
+ },
542
+ {
543
+ "epoch": 3.73,
544
+ "learning_rate": 1.2545931758530186e-05,
545
+ "loss": 0.7289,
546
+ "step": 7100
547
+ },
548
+ {
549
+ "epoch": 3.78,
550
+ "learning_rate": 1.2440944881889764e-05,
551
+ "loss": 0.7444,
552
+ "step": 7200
553
+ },
554
+ {
555
+ "epoch": 3.83,
556
+ "learning_rate": 1.2335958005249345e-05,
557
+ "loss": 0.7574,
558
+ "step": 7300
559
+ },
560
+ {
561
+ "epoch": 3.88,
562
+ "learning_rate": 1.2230971128608923e-05,
563
+ "loss": 0.7504,
564
+ "step": 7400
565
+ },
566
+ {
567
+ "epoch": 3.94,
568
+ "learning_rate": 1.2125984251968505e-05,
569
+ "loss": 0.7344,
570
+ "step": 7500
571
+ },
572
+ {
573
+ "epoch": 3.94,
574
+ "eval_loss": 0.8257057666778564,
575
+ "eval_runtime": 59.7911,
576
+ "eval_samples_per_second": 236.373,
577
+ "eval_steps_per_second": 14.785,
578
+ "step": 7500
579
+ },
580
+ {
581
+ "epoch": 3.99,
582
+ "learning_rate": 1.2020997375328086e-05,
583
+ "loss": 0.7239,
584
+ "step": 7600
585
+ },
586
+ {
587
+ "epoch": 4.04,
588
+ "learning_rate": 1.1916010498687664e-05,
589
+ "loss": 0.7317,
590
+ "step": 7700
591
+ },
592
+ {
593
+ "epoch": 4.09,
594
+ "learning_rate": 1.1811023622047245e-05,
595
+ "loss": 0.7116,
596
+ "step": 7800
597
+ },
598
+ {
599
+ "epoch": 4.15,
600
+ "learning_rate": 1.1706036745406827e-05,
601
+ "loss": 0.7089,
602
+ "step": 7900
603
+ },
604
+ {
605
+ "epoch": 4.2,
606
+ "learning_rate": 1.1601049868766405e-05,
607
+ "loss": 0.7383,
608
+ "step": 8000
609
+ },
610
+ {
611
+ "epoch": 4.2,
612
+ "eval_loss": 0.8250786066055298,
613
+ "eval_runtime": 59.5177,
614
+ "eval_samples_per_second": 237.459,
615
+ "eval_steps_per_second": 14.853,
616
+ "step": 8000
617
+ },
618
+ {
619
+ "epoch": 4.25,
620
+ "learning_rate": 1.1496062992125985e-05,
621
+ "loss": 0.6883,
622
+ "step": 8100
623
+ },
624
+ {
625
+ "epoch": 4.3,
626
+ "learning_rate": 1.1391076115485564e-05,
627
+ "loss": 0.7232,
628
+ "step": 8200
629
+ },
630
+ {
631
+ "epoch": 4.36,
632
+ "learning_rate": 1.1286089238845146e-05,
633
+ "loss": 0.6993,
634
+ "step": 8300
635
+ },
636
+ {
637
+ "epoch": 4.41,
638
+ "learning_rate": 1.1181102362204725e-05,
639
+ "loss": 0.7232,
640
+ "step": 8400
641
+ },
642
+ {
643
+ "epoch": 4.46,
644
+ "learning_rate": 1.1076115485564305e-05,
645
+ "loss": 0.6693,
646
+ "step": 8500
647
+ },
648
+ {
649
+ "epoch": 4.46,
650
+ "eval_loss": 0.8354575037956238,
651
+ "eval_runtime": 59.8496,
652
+ "eval_samples_per_second": 236.142,
653
+ "eval_steps_per_second": 14.77,
654
+ "step": 8500
655
+ },
656
+ {
657
+ "epoch": 4.51,
658
+ "learning_rate": 1.0971128608923884e-05,
659
+ "loss": 0.6875,
660
+ "step": 8600
661
+ },
662
+ {
663
+ "epoch": 4.57,
664
+ "learning_rate": 1.0866141732283466e-05,
665
+ "loss": 0.6938,
666
+ "step": 8700
667
+ },
668
+ {
669
+ "epoch": 4.62,
670
+ "learning_rate": 1.0761154855643046e-05,
671
+ "loss": 0.7113,
672
+ "step": 8800
673
+ },
674
+ {
675
+ "epoch": 4.67,
676
+ "learning_rate": 1.0656167979002625e-05,
677
+ "loss": 0.6836,
678
+ "step": 8900
679
+ },
680
+ {
681
+ "epoch": 4.72,
682
+ "learning_rate": 1.0551181102362205e-05,
683
+ "loss": 0.7016,
684
+ "step": 9000
685
+ },
686
+ {
687
+ "epoch": 4.72,
688
+ "eval_loss": 0.8524064421653748,
689
+ "eval_runtime": 58.806,
690
+ "eval_samples_per_second": 240.333,
691
+ "eval_steps_per_second": 15.032,
692
+ "step": 9000
693
+ },
694
+ {
695
+ "epoch": 4.78,
696
+ "learning_rate": 1.0446194225721787e-05,
697
+ "loss": 0.7148,
698
+ "step": 9100
699
+ },
700
+ {
701
+ "epoch": 4.83,
702
+ "learning_rate": 1.0341207349081366e-05,
703
+ "loss": 0.7192,
704
+ "step": 9200
705
+ },
706
+ {
707
+ "epoch": 4.88,
708
+ "learning_rate": 1.0236220472440946e-05,
709
+ "loss": 0.7382,
710
+ "step": 9300
711
+ },
712
+ {
713
+ "epoch": 4.93,
714
+ "learning_rate": 1.0131233595800525e-05,
715
+ "loss": 0.7054,
716
+ "step": 9400
717
+ },
718
+ {
719
+ "epoch": 4.99,
720
+ "learning_rate": 1.0026246719160107e-05,
721
+ "loss": 0.7045,
722
+ "step": 9500
723
+ },
724
+ {
725
+ "epoch": 4.99,
726
+ "eval_loss": 0.825893223285675,
727
+ "eval_runtime": 58.4492,
728
+ "eval_samples_per_second": 241.8,
729
+ "eval_steps_per_second": 15.124,
730
+ "step": 9500
731
+ },
732
+ {
733
+ "epoch": 5.04,
734
+ "learning_rate": 9.921259842519685e-06,
735
+ "loss": 0.6797,
736
+ "step": 9600
737
+ },
738
+ {
739
+ "epoch": 5.09,
740
+ "learning_rate": 9.816272965879266e-06,
741
+ "loss": 0.6586,
742
+ "step": 9700
743
+ },
744
+ {
745
+ "epoch": 5.14,
746
+ "learning_rate": 9.711286089238846e-06,
747
+ "loss": 0.6772,
748
+ "step": 9800
749
+ },
750
+ {
751
+ "epoch": 5.2,
752
+ "learning_rate": 9.606299212598426e-06,
753
+ "loss": 0.6732,
754
+ "step": 9900
755
+ },
756
+ {
757
+ "epoch": 5.25,
758
+ "learning_rate": 9.501312335958006e-06,
759
+ "loss": 0.6653,
760
+ "step": 10000
761
+ },
762
+ {
763
+ "epoch": 5.25,
764
+ "eval_loss": 0.8119387626647949,
765
+ "eval_runtime": 59.2296,
766
+ "eval_samples_per_second": 238.614,
767
+ "eval_steps_per_second": 14.925,
768
+ "step": 10000
769
+ },
770
+ {
771
+ "epoch": 5.3,
772
+ "learning_rate": 9.396325459317585e-06,
773
+ "loss": 0.6568,
774
+ "step": 10100
775
+ },
776
+ {
777
+ "epoch": 5.35,
778
+ "learning_rate": 9.291338582677165e-06,
779
+ "loss": 0.6598,
780
+ "step": 10200
781
+ },
782
+ {
783
+ "epoch": 5.41,
784
+ "learning_rate": 9.186351706036746e-06,
785
+ "loss": 0.6752,
786
+ "step": 10300
787
+ },
788
+ {
789
+ "epoch": 5.46,
790
+ "learning_rate": 9.081364829396326e-06,
791
+ "loss": 0.6568,
792
+ "step": 10400
793
+ },
794
+ {
795
+ "epoch": 5.51,
796
+ "learning_rate": 8.976377952755906e-06,
797
+ "loss": 0.6642,
798
+ "step": 10500
799
+ },
800
+ {
801
+ "epoch": 5.51,
802
+ "eval_loss": 0.8364242911338806,
803
+ "eval_runtime": 59.3425,
804
+ "eval_samples_per_second": 238.16,
805
+ "eval_steps_per_second": 14.897,
806
+ "step": 10500
807
+ },
808
+ {
809
+ "epoch": 5.56,
810
+ "learning_rate": 8.871391076115487e-06,
811
+ "loss": 0.6672,
812
+ "step": 10600
813
+ },
814
+ {
815
+ "epoch": 5.62,
816
+ "learning_rate": 8.766404199475065e-06,
817
+ "loss": 0.6929,
818
+ "step": 10700
819
+ },
820
+ {
821
+ "epoch": 5.67,
822
+ "learning_rate": 8.661417322834647e-06,
823
+ "loss": 0.6719,
824
+ "step": 10800
825
+ },
826
+ {
827
+ "epoch": 5.72,
828
+ "learning_rate": 8.556430446194226e-06,
829
+ "loss": 0.6864,
830
+ "step": 10900
831
+ },
832
+ {
833
+ "epoch": 5.77,
834
+ "learning_rate": 8.451443569553806e-06,
835
+ "loss": 0.7038,
836
+ "step": 11000
837
+ },
838
+ {
839
+ "epoch": 5.77,
840
+ "eval_loss": 0.8415279984474182,
841
+ "eval_runtime": 60.2905,
842
+ "eval_samples_per_second": 234.415,
843
+ "eval_steps_per_second": 14.662,
844
+ "step": 11000
845
+ },
846
+ {
847
+ "epoch": 5.83,
848
+ "learning_rate": 8.346456692913387e-06,
849
+ "loss": 0.6463,
850
+ "step": 11100
851
+ },
852
+ {
853
+ "epoch": 5.88,
854
+ "learning_rate": 8.241469816272967e-06,
855
+ "loss": 0.6362,
856
+ "step": 11200
857
+ },
858
+ {
859
+ "epoch": 5.93,
860
+ "learning_rate": 8.136482939632546e-06,
861
+ "loss": 0.6833,
862
+ "step": 11300
863
+ },
864
+ {
865
+ "epoch": 5.98,
866
+ "learning_rate": 8.031496062992128e-06,
867
+ "loss": 0.6493,
868
+ "step": 11400
869
+ },
870
+ {
871
+ "epoch": 6.04,
872
+ "learning_rate": 7.926509186351706e-06,
873
+ "loss": 0.6482,
874
+ "step": 11500
875
+ },
876
+ {
877
+ "epoch": 6.04,
878
+ "eval_loss": 0.8445884585380554,
879
+ "eval_runtime": 60.3998,
880
+ "eval_samples_per_second": 233.991,
881
+ "eval_steps_per_second": 14.636,
882
+ "step": 11500
883
+ },
884
+ {
885
+ "epoch": 6.09,
886
+ "learning_rate": 7.821522309711287e-06,
887
+ "loss": 0.6318,
888
+ "step": 11600
889
+ },
890
+ {
891
+ "epoch": 6.14,
892
+ "learning_rate": 7.716535433070867e-06,
893
+ "loss": 0.6638,
894
+ "step": 11700
895
+ },
896
+ {
897
+ "epoch": 6.19,
898
+ "learning_rate": 7.611548556430447e-06,
899
+ "loss": 0.6958,
900
+ "step": 11800
901
+ },
902
+ {
903
+ "epoch": 6.25,
904
+ "learning_rate": 7.506561679790027e-06,
905
+ "loss": 0.6516,
906
+ "step": 11900
907
+ },
908
+ {
909
+ "epoch": 6.3,
910
+ "learning_rate": 7.401574803149607e-06,
911
+ "loss": 0.6401,
912
+ "step": 12000
913
+ },
914
+ {
915
+ "epoch": 6.3,
916
+ "eval_loss": 0.8119750022888184,
917
+ "eval_runtime": 58.8481,
918
+ "eval_samples_per_second": 240.161,
919
+ "eval_steps_per_second": 15.022,
920
+ "step": 12000
921
+ },
922
+ {
923
+ "epoch": 6.35,
924
+ "learning_rate": 7.2965879265091864e-06,
925
+ "loss": 0.6509,
926
+ "step": 12100
927
+ },
928
+ {
929
+ "epoch": 6.4,
930
+ "learning_rate": 7.191601049868768e-06,
931
+ "loss": 0.6664,
932
+ "step": 12200
933
+ },
934
+ {
935
+ "epoch": 6.46,
936
+ "learning_rate": 7.086614173228347e-06,
937
+ "loss": 0.6701,
938
+ "step": 12300
939
+ },
940
+ {
941
+ "epoch": 6.51,
942
+ "learning_rate": 6.981627296587927e-06,
943
+ "loss": 0.6489,
944
+ "step": 12400
945
+ },
946
+ {
947
+ "epoch": 6.56,
948
+ "learning_rate": 6.876640419947507e-06,
949
+ "loss": 0.6666,
950
+ "step": 12500
951
+ },
952
+ {
953
+ "epoch": 6.56,
954
+ "eval_loss": 0.8412150740623474,
955
+ "eval_runtime": 58.9938,
956
+ "eval_samples_per_second": 239.568,
957
+ "eval_steps_per_second": 14.985,
958
+ "step": 12500
959
+ },
960
+ {
961
+ "epoch": 6.61,
962
+ "learning_rate": 6.771653543307087e-06,
963
+ "loss": 0.6648,
964
+ "step": 12600
965
+ },
966
+ {
967
+ "epoch": 6.67,
968
+ "learning_rate": 6.666666666666667e-06,
969
+ "loss": 0.6549,
970
+ "step": 12700
971
+ },
972
+ {
973
+ "epoch": 6.72,
974
+ "learning_rate": 6.561679790026248e-06,
975
+ "loss": 0.6372,
976
+ "step": 12800
977
+ },
978
+ {
979
+ "epoch": 6.77,
980
+ "learning_rate": 6.456692913385827e-06,
981
+ "loss": 0.6321,
982
+ "step": 12900
983
+ },
984
+ {
985
+ "epoch": 6.82,
986
+ "learning_rate": 6.351706036745408e-06,
987
+ "loss": 0.6529,
988
+ "step": 13000
989
+ },
990
+ {
991
+ "epoch": 6.82,
992
+ "eval_loss": 0.8355860710144043,
993
+ "eval_runtime": 59.1629,
994
+ "eval_samples_per_second": 238.883,
995
+ "eval_steps_per_second": 14.942,
996
+ "step": 13000
997
+ },
998
+ {
999
+ "epoch": 6.88,
1000
+ "learning_rate": 6.246719160104987e-06,
1001
+ "loss": 0.6384,
1002
+ "step": 13100
1003
+ },
1004
+ {
1005
+ "epoch": 6.93,
1006
+ "learning_rate": 6.141732283464567e-06,
1007
+ "loss": 0.6409,
1008
+ "step": 13200
1009
+ },
1010
+ {
1011
+ "epoch": 6.98,
1012
+ "learning_rate": 6.036745406824147e-06,
1013
+ "loss": 0.6385,
1014
+ "step": 13300
1015
+ },
1016
+ {
1017
+ "epoch": 7.03,
1018
+ "learning_rate": 5.931758530183728e-06,
1019
+ "loss": 0.639,
1020
+ "step": 13400
1021
+ },
1022
+ {
1023
+ "epoch": 7.09,
1024
+ "learning_rate": 5.8267716535433075e-06,
1025
+ "loss": 0.6462,
1026
+ "step": 13500
1027
+ },
1028
+ {
1029
+ "epoch": 7.09,
1030
+ "eval_loss": 0.8167649507522583,
1031
+ "eval_runtime": 59.7264,
1032
+ "eval_samples_per_second": 236.629,
1033
+ "eval_steps_per_second": 14.801,
1034
+ "step": 13500
1035
+ },
1036
+ {
1037
+ "epoch": 7.14,
1038
+ "learning_rate": 5.721784776902888e-06,
1039
+ "loss": 0.6125,
1040
+ "step": 13600
1041
+ },
1042
+ {
1043
+ "epoch": 7.19,
1044
+ "learning_rate": 5.616797900262467e-06,
1045
+ "loss": 0.6371,
1046
+ "step": 13700
1047
+ },
1048
+ {
1049
+ "epoch": 7.24,
1050
+ "learning_rate": 5.511811023622048e-06,
1051
+ "loss": 0.6003,
1052
+ "step": 13800
1053
+ },
1054
+ {
1055
+ "epoch": 7.3,
1056
+ "learning_rate": 5.406824146981627e-06,
1057
+ "loss": 0.6322,
1058
+ "step": 13900
1059
+ },
1060
+ {
1061
+ "epoch": 7.35,
1062
+ "learning_rate": 5.301837270341208e-06,
1063
+ "loss": 0.6313,
1064
+ "step": 14000
1065
+ },
1066
+ {
1067
+ "epoch": 7.35,
1068
+ "eval_loss": 0.8006855249404907,
1069
+ "eval_runtime": 60.1264,
1070
+ "eval_samples_per_second": 235.055,
1071
+ "eval_steps_per_second": 14.702,
1072
+ "step": 14000
1073
+ },
1074
+ {
1075
+ "epoch": 7.4,
1076
+ "learning_rate": 5.196850393700788e-06,
1077
+ "loss": 0.6166,
1078
+ "step": 14100
1079
+ },
1080
+ {
1081
+ "epoch": 7.45,
1082
+ "learning_rate": 5.091863517060368e-06,
1083
+ "loss": 0.6192,
1084
+ "step": 14200
1085
+ },
1086
+ {
1087
+ "epoch": 7.51,
1088
+ "learning_rate": 4.986876640419948e-06,
1089
+ "loss": 0.648,
1090
+ "step": 14300
1091
+ },
1092
+ {
1093
+ "epoch": 7.56,
1094
+ "learning_rate": 4.881889763779528e-06,
1095
+ "loss": 0.6441,
1096
+ "step": 14400
1097
+ },
1098
+ {
1099
+ "epoch": 7.61,
1100
+ "learning_rate": 4.776902887139108e-06,
1101
+ "loss": 0.64,
1102
+ "step": 14500
1103
+ },
1104
+ {
1105
+ "epoch": 7.61,
1106
+ "eval_loss": 0.7937971949577332,
1107
+ "eval_runtime": 61.4082,
1108
+ "eval_samples_per_second": 230.149,
1109
+ "eval_steps_per_second": 14.395,
1110
+ "step": 14500
1111
+ }
1112
+ ],
1113
+ "max_steps": 19050,
1114
+ "num_train_epochs": 10,
1115
+ "total_flos": 5.068483991811235e+16,
1116
+ "trial_name": null,
1117
+ "trial_params": null
1118
+ }