wcosmas commited on
Commit
42b8067
1 Parent(s): ab25088

End of training

Browse files
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9191176470588235
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +33,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.2707
37
- - Accuracy: 0.9191
38
 
39
  ## Model description
40
 
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.9338235294117647
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.2825
37
+ - Accuracy: 0.9338
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 50.0,
3
- "eval_accuracy": 0.9731993299832495,
4
- "eval_loss": 0.10922118276357651,
5
- "eval_runtime": 99.3108,
6
- "eval_samples_per_second": 6.011,
7
- "eval_steps_per_second": 0.191,
8
- "total_flos": 2.0803097508518707e+19,
9
- "train_loss": 0.1354110169055916,
10
- "train_runtime": 52600.7464,
11
- "train_samples_per_second": 5.104,
12
- "train_steps_per_second": 0.04
13
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "eval_accuracy": 0.9338235294117647,
4
+ "eval_loss": 0.28249993920326233,
5
+ "eval_runtime": 30.22,
6
+ "eval_samples_per_second": 4.5,
7
+ "eval_steps_per_second": 0.165,
8
+ "total_flos": 4.3781443993328026e+18,
9
+ "train_loss": 0.4078153912226359,
10
+ "train_runtime": 14896.2203,
11
+ "train_samples_per_second": 4.108,
12
+ "train_steps_per_second": 0.03
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 50.0,
3
- "eval_accuracy": 0.9731993299832495,
4
- "eval_loss": 0.10922118276357651,
5
- "eval_runtime": 99.3108,
6
- "eval_samples_per_second": 6.011,
7
- "eval_steps_per_second": 0.191
8
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "eval_accuracy": 0.9338235294117647,
4
+ "eval_loss": 0.28249993920326233,
5
+ "eval_runtime": 30.22,
6
+ "eval_samples_per_second": 4.5,
7
+ "eval_steps_per_second": 0.165
8
  }
runs/Oct22_18-03-39_549dd11dc1c2/events.out.tfevents.1729635164.549dd11dc1c2.1065.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89307b63cf5306a373f9652323bdbf974157f375f17284cd6596c0e85eda5fdb
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 50.0,
3
- "total_flos": 2.0803097508518707e+19,
4
- "train_loss": 0.1354110169055916,
5
- "train_runtime": 52600.7464,
6
- "train_samples_per_second": 5.104,
7
- "train_steps_per_second": 0.04
8
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "total_flos": 4.3781443993328026e+18,
4
+ "train_loss": 0.4078153912226359,
5
+ "train_runtime": 14896.2203,
6
+ "train_samples_per_second": 4.108,
7
+ "train_steps_per_second": 0.03
8
  }
trainer_state.json CHANGED
@@ -1,1945 +1,763 @@
1
  {
2
- "best_metric": 0.9731993299832495,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-1680",
4
- "epoch": 50.0,
5
  "eval_steps": 500,
6
- "global_step": 2100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.23809523809523808,
13
- "grad_norm": 1.9611310958862305,
14
- "learning_rate": 2.3809523809523808e-06,
15
- "loss": 1.4059,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.47619047619047616,
20
- "grad_norm": 1.8307807445526123,
21
- "learning_rate": 4.7619047619047615e-06,
22
- "loss": 1.3406,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.7142857142857143,
27
- "grad_norm": 1.8257180452346802,
28
- "learning_rate": 7.142857142857143e-06,
29
- "loss": 1.2533,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.9523809523809523,
34
- "grad_norm": 1.2919795513153076,
35
- "learning_rate": 9.523809523809523e-06,
36
- "loss": 1.1553,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 1.0,
41
- "eval_accuracy": 0.5477386934673367,
42
- "eval_loss": 1.0950442552566528,
43
- "eval_runtime": 104.5,
44
- "eval_samples_per_second": 5.713,
45
- "eval_steps_per_second": 0.182,
46
- "step": 42
47
- },
48
- {
49
- "epoch": 1.1904761904761905,
50
- "grad_norm": 0.9992073178291321,
51
- "learning_rate": 1.1904761904761905e-05,
52
- "loss": 1.0508,
53
- "step": 50
54
  },
55
  {
56
- "epoch": 1.4285714285714286,
57
- "grad_norm": 0.8946818113327026,
58
- "learning_rate": 1.4285714285714285e-05,
59
- "loss": 0.952,
60
- "step": 60
61
- },
62
- {
63
- "epoch": 1.6666666666666665,
64
- "grad_norm": 1.0831001996994019,
65
- "learning_rate": 1.6666666666666667e-05,
66
- "loss": 0.8539,
67
- "step": 70
68
- },
69
- {
70
- "epoch": 1.9047619047619047,
71
- "grad_norm": 1.271881103515625,
72
- "learning_rate": 1.9047619047619046e-05,
73
- "loss": 0.7791,
74
- "step": 80
75
- },
76
- {
77
- "epoch": 2.0,
78
- "eval_accuracy": 0.8525963149078727,
79
- "eval_loss": 0.6485655307769775,
80
- "eval_runtime": 107.1905,
81
- "eval_samples_per_second": 5.57,
82
- "eval_steps_per_second": 0.177,
83
- "step": 84
84
- },
85
- {
86
- "epoch": 2.142857142857143,
87
- "grad_norm": 1.7645951509475708,
88
- "learning_rate": 2.1428571428571428e-05,
89
- "loss": 0.6112,
90
- "step": 90
91
- },
92
- {
93
- "epoch": 2.380952380952381,
94
- "grad_norm": 2.271763563156128,
95
- "learning_rate": 2.380952380952381e-05,
96
- "loss": 0.5029,
97
- "step": 100
98
- },
99
- {
100
- "epoch": 2.619047619047619,
101
- "grad_norm": 2.248612403869629,
102
- "learning_rate": 2.6190476190476192e-05,
103
- "loss": 0.4809,
104
- "step": 110
105
  },
106
  {
107
- "epoch": 2.857142857142857,
108
- "grad_norm": 2.1449034214019775,
109
- "learning_rate": 2.857142857142857e-05,
110
- "loss": 0.433,
111
- "step": 120
 
 
112
  },
113
  {
114
- "epoch": 3.0,
115
- "eval_accuracy": 0.9128978224455612,
116
- "eval_loss": 0.3716076910495758,
117
- "eval_runtime": 107.5442,
118
- "eval_samples_per_second": 5.551,
119
- "eval_steps_per_second": 0.177,
120
- "step": 126
121
  },
122
  {
123
- "epoch": 3.0952380952380953,
124
- "grad_norm": 2.3766279220581055,
125
- "learning_rate": 3.095238095238095e-05,
126
- "loss": 0.3777,
127
- "step": 130
 
 
128
  },
129
  {
130
- "epoch": 3.3333333333333335,
131
- "grad_norm": 1.3414802551269531,
132
  "learning_rate": 3.3333333333333335e-05,
133
- "loss": 0.3424,
134
- "step": 140
135
- },
136
- {
137
- "epoch": 3.571428571428571,
138
- "grad_norm": 7.162674903869629,
139
- "learning_rate": 3.571428571428572e-05,
140
- "loss": 0.3719,
141
- "step": 150
142
- },
143
- {
144
- "epoch": 3.8095238095238093,
145
- "grad_norm": 2.170807123184204,
146
- "learning_rate": 3.809523809523809e-05,
147
- "loss": 0.3495,
148
- "step": 160
149
  },
150
  {
151
  "epoch": 4.0,
152
- "eval_accuracy": 0.9346733668341709,
153
- "eval_loss": 0.28690266609191895,
154
- "eval_runtime": 106.4197,
155
- "eval_samples_per_second": 5.61,
156
- "eval_steps_per_second": 0.179,
157
- "step": 168
158
- },
159
- {
160
- "epoch": 4.0476190476190474,
161
- "grad_norm": 1.2484022378921509,
162
- "learning_rate": 4.047619047619048e-05,
163
- "loss": 0.2892,
164
- "step": 170
165
- },
166
- {
167
- "epoch": 4.285714285714286,
168
- "grad_norm": 7.085641860961914,
169
- "learning_rate": 4.2857142857142856e-05,
170
- "loss": 0.3386,
171
- "step": 180
172
- },
173
- {
174
- "epoch": 4.523809523809524,
175
- "grad_norm": 1.4148013591766357,
176
- "learning_rate": 4.523809523809524e-05,
177
- "loss": 0.3225,
178
- "step": 190
179
- },
180
- {
181
- "epoch": 4.761904761904762,
182
- "grad_norm": 2.7628612518310547,
183
- "learning_rate": 4.761904761904762e-05,
184
- "loss": 0.2979,
185
- "step": 200
186
- },
187
- {
188
- "epoch": 5.0,
189
- "grad_norm": 1.5106964111328125,
190
- "learning_rate": 5e-05,
191
- "loss": 0.2556,
192
- "step": 210
193
- },
194
- {
195
- "epoch": 5.0,
196
- "eval_accuracy": 0.9279731993299832,
197
- "eval_loss": 0.27220866084098816,
198
- "eval_runtime": 106.2256,
199
- "eval_samples_per_second": 5.62,
200
- "eval_steps_per_second": 0.179,
201
- "step": 210
202
- },
203
- {
204
- "epoch": 5.238095238095238,
205
- "grad_norm": 4.107568740844727,
206
- "learning_rate": 4.973544973544973e-05,
207
- "loss": 0.2635,
208
- "step": 220
209
  },
210
  {
211
- "epoch": 5.476190476190476,
212
- "grad_norm": 2.1257033348083496,
213
- "learning_rate": 4.9470899470899475e-05,
214
- "loss": 0.2581,
215
- "step": 230
216
- },
217
- {
218
- "epoch": 5.714285714285714,
219
- "grad_norm": 2.4218671321868896,
220
- "learning_rate": 4.9206349206349204e-05,
221
- "loss": 0.2303,
222
- "step": 240
223
- },
224
- {
225
- "epoch": 5.9523809523809526,
226
- "grad_norm": 2.0036144256591797,
227
- "learning_rate": 4.894179894179895e-05,
228
- "loss": 0.2791,
229
- "step": 250
230
  },
231
  {
232
- "epoch": 6.0,
233
- "eval_accuracy": 0.932998324958124,
234
- "eval_loss": 0.2610774338245392,
235
- "eval_runtime": 106.375,
236
- "eval_samples_per_second": 5.612,
237
- "eval_steps_per_second": 0.179,
238
- "step": 252
239
  },
240
  {
241
- "epoch": 6.190476190476191,
242
- "grad_norm": 2.8978521823883057,
243
- "learning_rate": 4.8677248677248676e-05,
244
- "loss": 0.224,
245
- "step": 260
246
  },
247
  {
248
- "epoch": 6.428571428571429,
249
- "grad_norm": 1.3699370622634888,
250
- "learning_rate": 4.841269841269841e-05,
251
- "loss": 0.2208,
252
- "step": 270
 
 
253
  },
254
  {
255
- "epoch": 6.666666666666667,
256
- "grad_norm": 1.7077748775482178,
257
  "learning_rate": 4.814814814814815e-05,
258
- "loss": 0.2294,
259
- "step": 280
260
- },
261
- {
262
- "epoch": 6.904761904761905,
263
- "grad_norm": 2.22580885887146,
264
- "learning_rate": 4.7883597883597884e-05,
265
- "loss": 0.2343,
266
- "step": 290
267
- },
268
- {
269
- "epoch": 7.0,
270
- "eval_accuracy": 0.9380234505862647,
271
- "eval_loss": 0.2376711070537567,
272
- "eval_runtime": 104.2609,
273
- "eval_samples_per_second": 5.726,
274
- "eval_steps_per_second": 0.182,
275
- "step": 294
276
- },
277
- {
278
- "epoch": 7.142857142857143,
279
- "grad_norm": 1.8247556686401367,
280
- "learning_rate": 4.761904761904762e-05,
281
- "loss": 0.2295,
282
- "step": 300
283
- },
284
- {
285
- "epoch": 7.380952380952381,
286
- "grad_norm": 2.389512538909912,
287
- "learning_rate": 4.7354497354497356e-05,
288
- "loss": 0.2059,
289
- "step": 310
290
  },
291
  {
292
- "epoch": 7.619047619047619,
293
- "grad_norm": 1.6845765113830566,
294
- "learning_rate": 4.708994708994709e-05,
295
- "loss": 0.1938,
296
- "step": 320
 
 
297
  },
298
  {
299
- "epoch": 7.857142857142857,
300
- "grad_norm": 1.345927119255066,
301
- "learning_rate": 4.682539682539683e-05,
302
- "loss": 0.186,
303
- "step": 330
304
  },
305
  {
306
  "epoch": 8.0,
307
- "eval_accuracy": 0.9396984924623115,
308
- "eval_loss": 0.2157616764307022,
309
- "eval_runtime": 104.0067,
310
- "eval_samples_per_second": 5.74,
311
- "eval_steps_per_second": 0.183,
312
- "step": 336
313
- },
314
- {
315
- "epoch": 8.095238095238095,
316
- "grad_norm": 1.470657467842102,
317
- "learning_rate": 4.656084656084656e-05,
318
- "loss": 0.2077,
319
- "step": 340
320
- },
321
- {
322
- "epoch": 8.333333333333334,
323
- "grad_norm": 1.4709361791610718,
324
- "learning_rate": 4.62962962962963e-05,
325
- "loss": 0.1809,
326
- "step": 350
327
- },
328
- {
329
- "epoch": 8.571428571428571,
330
- "grad_norm": 1.731719732284546,
331
- "learning_rate": 4.603174603174603e-05,
332
- "loss": 0.2163,
333
- "step": 360
334
- },
335
- {
336
- "epoch": 8.80952380952381,
337
- "grad_norm": 1.8033312559127808,
338
- "learning_rate": 4.576719576719577e-05,
339
- "loss": 0.1984,
340
- "step": 370
341
- },
342
- {
343
- "epoch": 9.0,
344
- "eval_accuracy": 0.9346733668341709,
345
- "eval_loss": 0.22224725782871246,
346
- "eval_runtime": 104.8934,
347
- "eval_samples_per_second": 5.691,
348
- "eval_steps_per_second": 0.181,
349
- "step": 378
350
- },
351
- {
352
- "epoch": 9.047619047619047,
353
- "grad_norm": 1.9741649627685547,
354
- "learning_rate": 4.55026455026455e-05,
355
- "loss": 0.2093,
356
- "step": 380
357
- },
358
- {
359
- "epoch": 9.285714285714286,
360
- "grad_norm": 1.6360487937927246,
361
- "learning_rate": 4.523809523809524e-05,
362
- "loss": 0.1586,
363
- "step": 390
364
- },
365
- {
366
- "epoch": 9.523809523809524,
367
- "grad_norm": 2.772472858428955,
368
- "learning_rate": 4.4973544973544974e-05,
369
- "loss": 0.1423,
370
- "step": 400
371
  },
372
  {
373
- "epoch": 9.761904761904763,
374
- "grad_norm": 1.9369553327560425,
375
- "learning_rate": 4.470899470899471e-05,
376
- "loss": 0.1458,
377
- "step": 410
 
 
378
  },
379
  {
380
- "epoch": 10.0,
381
- "grad_norm": 2.552593231201172,
382
  "learning_rate": 4.4444444444444447e-05,
383
- "loss": 0.1751,
384
- "step": 420
385
- },
386
- {
387
- "epoch": 10.0,
388
- "eval_accuracy": 0.9514237855946399,
389
- "eval_loss": 0.19929908215999603,
390
- "eval_runtime": 106.1844,
391
- "eval_samples_per_second": 5.622,
392
- "eval_steps_per_second": 0.179,
393
- "step": 420
394
- },
395
- {
396
- "epoch": 10.238095238095237,
397
- "grad_norm": 2.9462811946868896,
398
- "learning_rate": 4.417989417989418e-05,
399
- "loss": 0.1305,
400
- "step": 430
401
- },
402
- {
403
- "epoch": 10.476190476190476,
404
- "grad_norm": 2.5522372722625732,
405
- "learning_rate": 4.391534391534391e-05,
406
- "loss": 0.1555,
407
- "step": 440
408
- },
409
- {
410
- "epoch": 10.714285714285714,
411
- "grad_norm": 1.2257236242294312,
412
- "learning_rate": 4.3650793650793655e-05,
413
- "loss": 0.1575,
414
- "step": 450
415
- },
416
- {
417
- "epoch": 10.952380952380953,
418
- "grad_norm": 0.9242783188819885,
419
- "learning_rate": 4.3386243386243384e-05,
420
- "loss": 0.1529,
421
- "step": 460
422
- },
423
- {
424
- "epoch": 11.0,
425
- "eval_accuracy": 0.9430485762144054,
426
- "eval_loss": 0.2100822627544403,
427
- "eval_runtime": 102.6058,
428
- "eval_samples_per_second": 5.818,
429
- "eval_steps_per_second": 0.185,
430
- "step": 462
431
  },
432
  {
433
- "epoch": 11.19047619047619,
434
- "grad_norm": 1.2224242687225342,
435
- "learning_rate": 4.312169312169313e-05,
436
- "loss": 0.1171,
437
- "step": 470
 
 
438
  },
439
  {
440
- "epoch": 11.428571428571429,
441
- "grad_norm": 1.332038164138794,
442
- "learning_rate": 4.2857142857142856e-05,
443
- "loss": 0.1399,
444
- "step": 480
445
  },
446
  {
447
- "epoch": 11.666666666666666,
448
- "grad_norm": 2.5912814140319824,
449
- "learning_rate": 4.259259259259259e-05,
450
- "loss": 0.129,
451
- "step": 490
 
 
452
  },
453
  {
454
- "epoch": 11.904761904761905,
455
- "grad_norm": 3.0691795349121094,
456
- "learning_rate": 4.232804232804233e-05,
457
- "loss": 0.1616,
458
- "step": 500
459
  },
460
  {
461
  "epoch": 12.0,
462
- "eval_accuracy": 0.9296482412060302,
463
- "eval_loss": 0.2542562484741211,
464
- "eval_runtime": 107.198,
465
- "eval_samples_per_second": 5.569,
466
- "eval_steps_per_second": 0.177,
467
- "step": 504
468
- },
469
- {
470
- "epoch": 12.142857142857142,
471
- "grad_norm": 1.015551209449768,
472
- "learning_rate": 4.2063492063492065e-05,
473
- "loss": 0.1374,
474
- "step": 510
475
- },
476
- {
477
- "epoch": 12.380952380952381,
478
- "grad_norm": 1.5744130611419678,
479
- "learning_rate": 4.17989417989418e-05,
480
- "loss": 0.1219,
481
- "step": 520
482
- },
483
- {
484
- "epoch": 12.619047619047619,
485
- "grad_norm": 2.7706940174102783,
486
- "learning_rate": 4.153439153439154e-05,
487
- "loss": 0.1159,
488
- "step": 530
489
- },
490
- {
491
- "epoch": 12.857142857142858,
492
- "grad_norm": 0.9542893767356873,
493
- "learning_rate": 4.126984126984127e-05,
494
- "loss": 0.1404,
495
- "step": 540
496
- },
497
- {
498
- "epoch": 13.0,
499
- "eval_accuracy": 0.9396984924623115,
500
- "eval_loss": 0.2028820812702179,
501
- "eval_runtime": 106.2276,
502
- "eval_samples_per_second": 5.62,
503
- "eval_steps_per_second": 0.179,
504
- "step": 546
505
  },
506
  {
507
- "epoch": 13.095238095238095,
508
- "grad_norm": 1.9553595781326294,
509
- "learning_rate": 4.100529100529101e-05,
510
- "loss": 0.1128,
511
- "step": 550
512
- },
513
- {
514
- "epoch": 13.333333333333334,
515
- "grad_norm": 0.4872543513774872,
516
  "learning_rate": 4.074074074074074e-05,
517
- "loss": 0.1133,
518
- "step": 560
519
- },
520
- {
521
- "epoch": 13.571428571428571,
522
- "grad_norm": 1.1375516653060913,
523
- "learning_rate": 4.047619047619048e-05,
524
- "loss": 0.1195,
525
- "step": 570
526
- },
527
- {
528
- "epoch": 13.80952380952381,
529
- "grad_norm": 2.1001851558685303,
530
- "learning_rate": 4.021164021164021e-05,
531
- "loss": 0.1078,
532
- "step": 580
533
- },
534
- {
535
- "epoch": 14.0,
536
- "eval_accuracy": 0.9413735343383585,
537
- "eval_loss": 0.20870448648929596,
538
- "eval_runtime": 103.1036,
539
- "eval_samples_per_second": 5.79,
540
- "eval_steps_per_second": 0.184,
541
- "step": 588
542
- },
543
- {
544
- "epoch": 14.047619047619047,
545
- "grad_norm": 1.273701786994934,
546
- "learning_rate": 3.9947089947089946e-05,
547
- "loss": 0.1078,
548
- "step": 590
549
- },
550
- {
551
- "epoch": 14.285714285714286,
552
- "grad_norm": 2.2141387462615967,
553
- "learning_rate": 3.968253968253968e-05,
554
- "loss": 0.1005,
555
- "step": 600
556
- },
557
- {
558
- "epoch": 14.523809523809524,
559
- "grad_norm": 2.147643566131592,
560
- "learning_rate": 3.941798941798942e-05,
561
- "loss": 0.1195,
562
- "step": 610
563
- },
564
- {
565
- "epoch": 14.761904761904763,
566
- "grad_norm": 1.8408890962600708,
567
- "learning_rate": 3.9153439153439155e-05,
568
- "loss": 0.0981,
569
- "step": 620
570
- },
571
- {
572
- "epoch": 15.0,
573
- "grad_norm": 2.7218682765960693,
574
- "learning_rate": 3.888888888888889e-05,
575
- "loss": 0.1109,
576
- "step": 630
577
- },
578
- {
579
- "epoch": 15.0,
580
- "eval_accuracy": 0.9614740368509213,
581
- "eval_loss": 0.1381397545337677,
582
- "eval_runtime": 102.743,
583
- "eval_samples_per_second": 5.811,
584
- "eval_steps_per_second": 0.185,
585
- "step": 630
586
- },
587
- {
588
- "epoch": 15.238095238095237,
589
- "grad_norm": 2.087874412536621,
590
- "learning_rate": 3.862433862433863e-05,
591
- "loss": 0.0932,
592
- "step": 640
593
- },
594
- {
595
- "epoch": 15.476190476190476,
596
- "grad_norm": 1.6582282781600952,
597
- "learning_rate": 3.835978835978836e-05,
598
- "loss": 0.12,
599
- "step": 650
600
  },
601
  {
602
- "epoch": 15.714285714285714,
603
- "grad_norm": 1.8671879768371582,
604
- "learning_rate": 3.809523809523809e-05,
605
- "loss": 0.0981,
606
- "step": 660
 
 
607
  },
608
  {
609
- "epoch": 15.952380952380953,
610
- "grad_norm": 1.7882401943206787,
611
- "learning_rate": 3.7830687830687835e-05,
612
- "loss": 0.1072,
613
- "step": 670
614
  },
615
  {
616
- "epoch": 16.0,
617
- "eval_accuracy": 0.9413735343383585,
618
- "eval_loss": 0.18952256441116333,
619
- "eval_runtime": 104.7109,
620
- "eval_samples_per_second": 5.701,
621
- "eval_steps_per_second": 0.181,
622
- "step": 672
623
  },
624
  {
625
- "epoch": 16.19047619047619,
626
- "grad_norm": 3.6218953132629395,
627
- "learning_rate": 3.7566137566137564e-05,
628
- "loss": 0.0818,
629
- "step": 680
630
  },
631
  {
632
- "epoch": 16.428571428571427,
633
- "grad_norm": 1.1185909509658813,
634
- "learning_rate": 3.730158730158731e-05,
635
- "loss": 0.0951,
636
- "step": 690
 
 
637
  },
638
  {
639
- "epoch": 16.666666666666668,
640
- "grad_norm": 0.9545117616653442,
641
  "learning_rate": 3.7037037037037037e-05,
642
- "loss": 0.1064,
643
- "step": 700
644
- },
645
- {
646
- "epoch": 16.904761904761905,
647
- "grad_norm": 0.7912140488624573,
648
- "learning_rate": 3.677248677248677e-05,
649
- "loss": 0.0949,
650
- "step": 710
651
- },
652
- {
653
- "epoch": 17.0,
654
- "eval_accuracy": 0.9396984924623115,
655
- "eval_loss": 0.19812369346618652,
656
- "eval_runtime": 103.2395,
657
- "eval_samples_per_second": 5.783,
658
- "eval_steps_per_second": 0.184,
659
- "step": 714
660
- },
661
- {
662
- "epoch": 17.142857142857142,
663
- "grad_norm": 2.5102455615997314,
664
- "learning_rate": 3.650793650793651e-05,
665
- "loss": 0.0942,
666
- "step": 720
667
- },
668
- {
669
- "epoch": 17.38095238095238,
670
- "grad_norm": 0.907739520072937,
671
- "learning_rate": 3.6243386243386245e-05,
672
- "loss": 0.081,
673
- "step": 730
674
- },
675
- {
676
- "epoch": 17.61904761904762,
677
- "grad_norm": 0.9580628275871277,
678
- "learning_rate": 3.597883597883598e-05,
679
- "loss": 0.0939,
680
- "step": 740
681
- },
682
- {
683
- "epoch": 17.857142857142858,
684
- "grad_norm": 0.4892265498638153,
685
- "learning_rate": 3.571428571428572e-05,
686
- "loss": 0.0908,
687
- "step": 750
688
- },
689
- {
690
- "epoch": 18.0,
691
- "eval_accuracy": 0.9581239530988275,
692
- "eval_loss": 0.16083765029907227,
693
- "eval_runtime": 103.0208,
694
- "eval_samples_per_second": 5.795,
695
- "eval_steps_per_second": 0.184,
696
- "step": 756
697
- },
698
- {
699
- "epoch": 18.095238095238095,
700
- "grad_norm": 0.6354324221611023,
701
- "learning_rate": 3.5449735449735446e-05,
702
- "loss": 0.0674,
703
- "step": 760
704
- },
705
- {
706
- "epoch": 18.333333333333332,
707
- "grad_norm": 1.7310459613800049,
708
- "learning_rate": 3.518518518518519e-05,
709
- "loss": 0.0963,
710
- "step": 770
711
- },
712
- {
713
- "epoch": 18.571428571428573,
714
- "grad_norm": 1.0339725017547607,
715
- "learning_rate": 3.492063492063492e-05,
716
- "loss": 0.0846,
717
- "step": 780
718
- },
719
- {
720
- "epoch": 18.80952380952381,
721
- "grad_norm": 2.478813648223877,
722
- "learning_rate": 3.465608465608466e-05,
723
- "loss": 0.0809,
724
- "step": 790
725
- },
726
- {
727
- "epoch": 19.0,
728
- "eval_accuracy": 0.9581239530988275,
729
- "eval_loss": 0.1764398217201233,
730
- "eval_runtime": 104.4696,
731
- "eval_samples_per_second": 5.715,
732
- "eval_steps_per_second": 0.182,
733
- "step": 798
734
  },
735
  {
736
- "epoch": 19.047619047619047,
737
- "grad_norm": 0.9190245270729065,
738
- "learning_rate": 3.439153439153439e-05,
739
- "loss": 0.0831,
740
- "step": 800
 
 
 
 
 
 
 
 
 
741
  },
742
  {
743
- "epoch": 19.285714285714285,
744
- "grad_norm": 1.1679134368896484,
745
- "learning_rate": 3.412698412698413e-05,
746
- "loss": 0.0737,
747
- "step": 810
 
 
748
  },
749
  {
750
- "epoch": 19.523809523809526,
751
- "grad_norm": 1.52895987033844,
752
- "learning_rate": 3.386243386243386e-05,
753
- "loss": 0.0695,
754
- "step": 820
755
  },
756
  {
757
- "epoch": 19.761904761904763,
758
- "grad_norm": 1.6142102479934692,
759
- "learning_rate": 3.35978835978836e-05,
760
- "loss": 0.0696,
761
- "step": 830
 
 
762
  },
763
  {
764
- "epoch": 20.0,
765
- "grad_norm": 2.2493298053741455,
766
  "learning_rate": 3.3333333333333335e-05,
767
- "loss": 0.0708,
768
- "step": 840
769
- },
770
- {
771
- "epoch": 20.0,
772
- "eval_accuracy": 0.9530988274706867,
773
- "eval_loss": 0.15123647451400757,
774
- "eval_runtime": 102.2975,
775
- "eval_samples_per_second": 5.836,
776
- "eval_steps_per_second": 0.186,
777
- "step": 840
778
- },
779
- {
780
- "epoch": 20.238095238095237,
781
- "grad_norm": 4.499787330627441,
782
- "learning_rate": 3.306878306878307e-05,
783
- "loss": 0.0936,
784
- "step": 850
785
- },
786
- {
787
- "epoch": 20.476190476190474,
788
- "grad_norm": 5.423216819763184,
789
- "learning_rate": 3.280423280423281e-05,
790
- "loss": 0.0712,
791
- "step": 860
792
- },
793
- {
794
- "epoch": 20.714285714285715,
795
- "grad_norm": 1.0831531286239624,
796
- "learning_rate": 3.253968253968254e-05,
797
- "loss": 0.0817,
798
- "step": 870
799
- },
800
- {
801
- "epoch": 20.952380952380953,
802
- "grad_norm": 1.6317639350891113,
803
- "learning_rate": 3.227513227513227e-05,
804
- "loss": 0.0757,
805
- "step": 880
806
- },
807
- {
808
- "epoch": 21.0,
809
- "eval_accuracy": 0.948073701842546,
810
- "eval_loss": 0.20271137356758118,
811
- "eval_runtime": 102.4721,
812
- "eval_samples_per_second": 5.826,
813
- "eval_steps_per_second": 0.185,
814
- "step": 882
815
- },
816
- {
817
- "epoch": 21.19047619047619,
818
- "grad_norm": 2.1182172298431396,
819
- "learning_rate": 3.2010582010582015e-05,
820
- "loss": 0.0882,
821
- "step": 890
822
- },
823
- {
824
- "epoch": 21.428571428571427,
825
- "grad_norm": 0.5835541486740112,
826
- "learning_rate": 3.1746031746031745e-05,
827
- "loss": 0.0607,
828
- "step": 900
829
- },
830
- {
831
- "epoch": 21.666666666666668,
832
- "grad_norm": 1.441300392150879,
833
- "learning_rate": 3.148148148148148e-05,
834
- "loss": 0.0859,
835
- "step": 910
836
- },
837
- {
838
- "epoch": 21.904761904761905,
839
- "grad_norm": 0.6337174773216248,
840
- "learning_rate": 3.121693121693122e-05,
841
- "loss": 0.0919,
842
- "step": 920
843
- },
844
- {
845
- "epoch": 22.0,
846
- "eval_accuracy": 0.9614740368509213,
847
- "eval_loss": 0.14867298305034637,
848
- "eval_runtime": 103.3409,
849
- "eval_samples_per_second": 5.777,
850
- "eval_steps_per_second": 0.184,
851
- "step": 924
852
- },
853
- {
854
- "epoch": 22.142857142857142,
855
- "grad_norm": 1.7783087491989136,
856
- "learning_rate": 3.095238095238095e-05,
857
- "loss": 0.0591,
858
- "step": 930
859
- },
860
- {
861
- "epoch": 22.38095238095238,
862
- "grad_norm": 0.4258907735347748,
863
- "learning_rate": 3.068783068783069e-05,
864
- "loss": 0.0722,
865
- "step": 940
866
  },
867
  {
868
- "epoch": 22.61904761904762,
869
- "grad_norm": 0.975234866142273,
870
- "learning_rate": 3.0423280423280425e-05,
871
- "loss": 0.0582,
872
- "step": 950
 
 
873
  },
874
  {
875
- "epoch": 22.857142857142858,
876
- "grad_norm": 0.9665831327438354,
877
- "learning_rate": 3.0158730158730158e-05,
878
- "loss": 0.07,
879
- "step": 960
880
  },
881
  {
882
- "epoch": 23.0,
883
- "eval_accuracy": 0.9614740368509213,
884
- "eval_loss": 0.16668196022510529,
885
- "eval_runtime": 103.2831,
886
- "eval_samples_per_second": 5.78,
887
- "eval_steps_per_second": 0.184,
888
- "step": 966
 
 
 
 
 
 
 
889
  },
890
  {
891
- "epoch": 23.095238095238095,
892
- "grad_norm": 1.7929288148880005,
893
- "learning_rate": 2.9894179894179897e-05,
894
- "loss": 0.0582,
895
- "step": 970
 
 
896
  },
897
  {
898
- "epoch": 23.333333333333332,
899
- "grad_norm": 0.43466824293136597,
900
  "learning_rate": 2.962962962962963e-05,
901
- "loss": 0.061,
902
- "step": 980
903
- },
904
- {
905
- "epoch": 23.571428571428573,
906
- "grad_norm": 2.2438175678253174,
907
- "learning_rate": 2.9365079365079366e-05,
908
- "loss": 0.0644,
909
- "step": 990
910
- },
911
- {
912
- "epoch": 23.80952380952381,
913
- "grad_norm": 0.8747345805168152,
914
- "learning_rate": 2.91005291005291e-05,
915
- "loss": 0.0629,
916
- "step": 1000
917
- },
918
- {
919
- "epoch": 24.0,
920
- "eval_accuracy": 0.9530988274706867,
921
- "eval_loss": 0.19044645130634308,
922
- "eval_runtime": 103.4446,
923
- "eval_samples_per_second": 5.771,
924
- "eval_steps_per_second": 0.184,
925
- "step": 1008
926
- },
927
- {
928
- "epoch": 24.047619047619047,
929
- "grad_norm": 1.7004871368408203,
930
- "learning_rate": 2.8835978835978838e-05,
931
- "loss": 0.0496,
932
- "step": 1010
933
- },
934
- {
935
- "epoch": 24.285714285714285,
936
- "grad_norm": 1.3886641263961792,
937
- "learning_rate": 2.857142857142857e-05,
938
- "loss": 0.0468,
939
- "step": 1020
940
- },
941
- {
942
- "epoch": 24.523809523809526,
943
- "grad_norm": 2.0595200061798096,
944
- "learning_rate": 2.830687830687831e-05,
945
- "loss": 0.0657,
946
- "step": 1030
947
- },
948
- {
949
- "epoch": 24.761904761904763,
950
- "grad_norm": 0.9633322954177856,
951
- "learning_rate": 2.8042328042328043e-05,
952
- "loss": 0.0626,
953
- "step": 1040
954
- },
955
- {
956
- "epoch": 25.0,
957
- "grad_norm": 0.629084050655365,
958
- "learning_rate": 2.777777777777778e-05,
959
- "loss": 0.0584,
960
- "step": 1050
961
- },
962
- {
963
- "epoch": 25.0,
964
- "eval_accuracy": 0.9631490787269682,
965
- "eval_loss": 0.15212486684322357,
966
- "eval_runtime": 104.2052,
967
- "eval_samples_per_second": 5.729,
968
- "eval_steps_per_second": 0.182,
969
- "step": 1050
970
- },
971
- {
972
- "epoch": 25.238095238095237,
973
- "grad_norm": 1.9510831832885742,
974
- "learning_rate": 2.7513227513227512e-05,
975
- "loss": 0.0514,
976
- "step": 1060
977
- },
978
- {
979
- "epoch": 25.476190476190474,
980
- "grad_norm": 0.6461337208747864,
981
- "learning_rate": 2.724867724867725e-05,
982
- "loss": 0.0619,
983
- "step": 1070
984
  },
985
  {
986
- "epoch": 25.714285714285715,
987
- "grad_norm": 0.8791431784629822,
988
- "learning_rate": 2.6984126984126984e-05,
989
- "loss": 0.0626,
990
- "step": 1080
 
 
991
  },
992
  {
993
- "epoch": 25.952380952380953,
994
- "grad_norm": 1.83372163772583,
995
- "learning_rate": 2.6719576719576723e-05,
996
- "loss": 0.0666,
997
- "step": 1090
998
  },
999
  {
1000
- "epoch": 26.0,
1001
- "eval_accuracy": 0.966499162479062,
1002
- "eval_loss": 0.1326070874929428,
1003
- "eval_runtime": 103.6677,
1004
- "eval_samples_per_second": 5.759,
1005
- "eval_steps_per_second": 0.183,
1006
- "step": 1092
1007
  },
1008
  {
1009
- "epoch": 26.19047619047619,
1010
- "grad_norm": 3.1547927856445312,
1011
- "learning_rate": 2.6455026455026456e-05,
1012
- "loss": 0.0607,
1013
- "step": 1100
1014
  },
1015
  {
1016
- "epoch": 26.428571428571427,
1017
- "grad_norm": 0.8336120247840881,
1018
- "learning_rate": 2.6190476190476192e-05,
1019
- "loss": 0.0458,
1020
- "step": 1110
 
 
1021
  },
1022
  {
1023
- "epoch": 26.666666666666668,
1024
- "grad_norm": 0.5386803150177002,
1025
  "learning_rate": 2.5925925925925925e-05,
1026
- "loss": 0.0638,
1027
- "step": 1120
1028
- },
1029
- {
1030
- "epoch": 26.904761904761905,
1031
- "grad_norm": 0.8411057591438293,
1032
- "learning_rate": 2.5661375661375664e-05,
1033
- "loss": 0.062,
1034
- "step": 1130
1035
- },
1036
- {
1037
- "epoch": 27.0,
1038
- "eval_accuracy": 0.9564489112227805,
1039
- "eval_loss": 0.17715045809745789,
1040
- "eval_runtime": 102.5942,
1041
- "eval_samples_per_second": 5.819,
1042
- "eval_steps_per_second": 0.185,
1043
- "step": 1134
1044
- },
1045
- {
1046
- "epoch": 27.142857142857142,
1047
- "grad_norm": 1.1352622509002686,
1048
- "learning_rate": 2.5396825396825397e-05,
1049
- "loss": 0.0396,
1050
- "step": 1140
1051
- },
1052
- {
1053
- "epoch": 27.38095238095238,
1054
- "grad_norm": 1.9047770500183105,
1055
- "learning_rate": 2.5132275132275137e-05,
1056
- "loss": 0.0383,
1057
- "step": 1150
1058
- },
1059
- {
1060
- "epoch": 27.61904761904762,
1061
- "grad_norm": 2.154599666595459,
1062
- "learning_rate": 2.4867724867724866e-05,
1063
- "loss": 0.0728,
1064
- "step": 1160
1065
- },
1066
- {
1067
- "epoch": 27.857142857142858,
1068
- "grad_norm": 1.5056850910186768,
1069
- "learning_rate": 2.4603174603174602e-05,
1070
- "loss": 0.0568,
1071
- "step": 1170
1072
- },
1073
- {
1074
- "epoch": 28.0,
1075
- "eval_accuracy": 0.9564489112227805,
1076
- "eval_loss": 0.14654366672039032,
1077
- "eval_runtime": 103.1379,
1078
- "eval_samples_per_second": 5.788,
1079
- "eval_steps_per_second": 0.184,
1080
- "step": 1176
1081
- },
1082
- {
1083
- "epoch": 28.095238095238095,
1084
- "grad_norm": 1.634865641593933,
1085
- "learning_rate": 2.4338624338624338e-05,
1086
- "loss": 0.0663,
1087
- "step": 1180
1088
- },
1089
- {
1090
- "epoch": 28.333333333333332,
1091
- "grad_norm": 1.265386939048767,
1092
- "learning_rate": 2.4074074074074074e-05,
1093
- "loss": 0.0487,
1094
- "step": 1190
1095
- },
1096
- {
1097
- "epoch": 28.571428571428573,
1098
- "grad_norm": 0.6159355044364929,
1099
- "learning_rate": 2.380952380952381e-05,
1100
- "loss": 0.0596,
1101
- "step": 1200
1102
- },
1103
- {
1104
- "epoch": 28.80952380952381,
1105
- "grad_norm": 1.0339206457138062,
1106
- "learning_rate": 2.3544973544973546e-05,
1107
- "loss": 0.0453,
1108
- "step": 1210
1109
  },
1110
  {
1111
- "epoch": 29.0,
1112
- "eval_accuracy": 0.9681742043551089,
1113
- "eval_loss": 0.13472113013267517,
1114
- "eval_runtime": 103.1249,
1115
- "eval_samples_per_second": 5.789,
1116
- "eval_steps_per_second": 0.184,
1117
- "step": 1218
1118
  },
1119
  {
1120
- "epoch": 29.047619047619047,
1121
- "grad_norm": 0.8177947998046875,
1122
- "learning_rate": 2.328042328042328e-05,
1123
- "loss": 0.055,
1124
- "step": 1220
1125
  },
1126
  {
1127
- "epoch": 29.285714285714285,
1128
- "grad_norm": 1.7629382610321045,
1129
- "learning_rate": 2.3015873015873015e-05,
1130
- "loss": 0.0476,
1131
- "step": 1230
 
 
1132
  },
1133
  {
1134
- "epoch": 29.523809523809526,
1135
- "grad_norm": 1.8531335592269897,
1136
- "learning_rate": 2.275132275132275e-05,
1137
- "loss": 0.0431,
1138
- "step": 1240
1139
  },
1140
  {
1141
- "epoch": 29.761904761904763,
1142
- "grad_norm": 0.961283802986145,
1143
- "learning_rate": 2.2486772486772487e-05,
1144
- "loss": 0.0579,
1145
- "step": 1250
 
 
1146
  },
1147
  {
1148
- "epoch": 30.0,
1149
- "grad_norm": 0.18748821318149567,
1150
  "learning_rate": 2.2222222222222223e-05,
1151
- "loss": 0.0469,
1152
- "step": 1260
1153
- },
1154
- {
1155
- "epoch": 30.0,
1156
- "eval_accuracy": 0.9631490787269682,
1157
- "eval_loss": 0.16871798038482666,
1158
- "eval_runtime": 105.6517,
1159
- "eval_samples_per_second": 5.651,
1160
- "eval_steps_per_second": 0.18,
1161
- "step": 1260
1162
- },
1163
- {
1164
- "epoch": 30.238095238095237,
1165
- "grad_norm": 0.8756251931190491,
1166
- "learning_rate": 2.1957671957671956e-05,
1167
- "loss": 0.0536,
1168
- "step": 1270
1169
- },
1170
- {
1171
- "epoch": 30.476190476190474,
1172
- "grad_norm": 0.7314756512641907,
1173
- "learning_rate": 2.1693121693121692e-05,
1174
- "loss": 0.0394,
1175
- "step": 1280
1176
- },
1177
- {
1178
- "epoch": 30.714285714285715,
1179
- "grad_norm": 1.9777828454971313,
1180
- "learning_rate": 2.1428571428571428e-05,
1181
- "loss": 0.0346,
1182
- "step": 1290
1183
- },
1184
- {
1185
- "epoch": 30.952380952380953,
1186
- "grad_norm": 1.4753316640853882,
1187
- "learning_rate": 2.1164021164021164e-05,
1188
- "loss": 0.0541,
1189
- "step": 1300
1190
- },
1191
- {
1192
- "epoch": 31.0,
1193
- "eval_accuracy": 0.9715242881072027,
1194
- "eval_loss": 0.13902144134044647,
1195
- "eval_runtime": 104.4849,
1196
- "eval_samples_per_second": 5.714,
1197
- "eval_steps_per_second": 0.182,
1198
- "step": 1302
1199
- },
1200
- {
1201
- "epoch": 31.19047619047619,
1202
- "grad_norm": 1.706081748008728,
1203
- "learning_rate": 2.08994708994709e-05,
1204
- "loss": 0.0613,
1205
- "step": 1310
1206
- },
1207
- {
1208
- "epoch": 31.428571428571427,
1209
- "grad_norm": 0.12419818341732025,
1210
- "learning_rate": 2.0634920634920636e-05,
1211
- "loss": 0.075,
1212
- "step": 1320
1213
- },
1214
- {
1215
- "epoch": 31.666666666666668,
1216
- "grad_norm": 0.7071540951728821,
1217
- "learning_rate": 2.037037037037037e-05,
1218
- "loss": 0.0468,
1219
- "step": 1330
1220
- },
1221
- {
1222
- "epoch": 31.904761904761905,
1223
- "grad_norm": 0.10454891622066498,
1224
- "learning_rate": 2.0105820105820105e-05,
1225
- "loss": 0.0602,
1226
- "step": 1340
1227
- },
1228
- {
1229
- "epoch": 32.0,
1230
- "eval_accuracy": 0.9614740368509213,
1231
- "eval_loss": 0.16181902587413788,
1232
- "eval_runtime": 102.8803,
1233
- "eval_samples_per_second": 5.803,
1234
- "eval_steps_per_second": 0.185,
1235
- "step": 1344
1236
- },
1237
- {
1238
- "epoch": 32.142857142857146,
1239
- "grad_norm": 0.773094654083252,
1240
- "learning_rate": 1.984126984126984e-05,
1241
- "loss": 0.0555,
1242
- "step": 1350
1243
- },
1244
- {
1245
- "epoch": 32.38095238095238,
1246
- "grad_norm": 1.865349292755127,
1247
- "learning_rate": 1.9576719576719577e-05,
1248
- "loss": 0.0518,
1249
- "step": 1360
1250
  },
1251
  {
1252
- "epoch": 32.61904761904762,
1253
- "grad_norm": 0.4416976571083069,
1254
- "learning_rate": 1.9312169312169313e-05,
1255
- "loss": 0.049,
1256
- "step": 1370
 
 
 
 
 
 
 
 
 
1257
  },
1258
  {
1259
- "epoch": 32.857142857142854,
1260
- "grad_norm": 2.0832931995391846,
1261
- "learning_rate": 1.9047619047619046e-05,
1262
- "loss": 0.0497,
1263
- "step": 1380
 
 
1264
  },
1265
  {
1266
- "epoch": 33.0,
1267
- "eval_accuracy": 0.9614740368509213,
1268
- "eval_loss": 0.1414780616760254,
1269
- "eval_runtime": 103.4955,
1270
- "eval_samples_per_second": 5.768,
1271
- "eval_steps_per_second": 0.184,
1272
- "step": 1386
1273
  },
1274
  {
1275
- "epoch": 33.095238095238095,
1276
- "grad_norm": 0.9570621848106384,
1277
- "learning_rate": 1.8783068783068782e-05,
1278
- "loss": 0.0532,
1279
- "step": 1390
 
 
1280
  },
1281
  {
1282
- "epoch": 33.333333333333336,
1283
- "grad_norm": 1.0575621128082275,
1284
  "learning_rate": 1.8518518518518518e-05,
1285
- "loss": 0.0555,
1286
- "step": 1400
1287
- },
1288
- {
1289
- "epoch": 33.57142857142857,
1290
- "grad_norm": 0.6880443096160889,
1291
- "learning_rate": 1.8253968253968254e-05,
1292
- "loss": 0.0454,
1293
- "step": 1410
1294
- },
1295
- {
1296
- "epoch": 33.80952380952381,
1297
- "grad_norm": 1.0119032859802246,
1298
- "learning_rate": 1.798941798941799e-05,
1299
- "loss": 0.0493,
1300
- "step": 1420
1301
- },
1302
- {
1303
- "epoch": 34.0,
1304
- "eval_accuracy": 0.9631490787269682,
1305
- "eval_loss": 0.1520875245332718,
1306
- "eval_runtime": 104.0599,
1307
- "eval_samples_per_second": 5.737,
1308
- "eval_steps_per_second": 0.183,
1309
- "step": 1428
1310
- },
1311
- {
1312
- "epoch": 34.04761904761905,
1313
- "grad_norm": 1.220492959022522,
1314
- "learning_rate": 1.7724867724867723e-05,
1315
- "loss": 0.0361,
1316
- "step": 1430
1317
- },
1318
- {
1319
- "epoch": 34.285714285714285,
1320
- "grad_norm": 0.5547053217887878,
1321
- "learning_rate": 1.746031746031746e-05,
1322
- "loss": 0.0412,
1323
- "step": 1440
1324
- },
1325
- {
1326
- "epoch": 34.523809523809526,
1327
- "grad_norm": 0.7015855312347412,
1328
- "learning_rate": 1.7195767195767195e-05,
1329
- "loss": 0.0425,
1330
- "step": 1450
1331
- },
1332
- {
1333
- "epoch": 34.76190476190476,
1334
- "grad_norm": 1.388316035270691,
1335
- "learning_rate": 1.693121693121693e-05,
1336
- "loss": 0.0342,
1337
- "step": 1460
1338
- },
1339
- {
1340
- "epoch": 35.0,
1341
- "grad_norm": 1.6578996181488037,
1342
- "learning_rate": 1.6666666666666667e-05,
1343
- "loss": 0.0606,
1344
- "step": 1470
1345
- },
1346
- {
1347
- "epoch": 35.0,
1348
- "eval_accuracy": 0.9698492462311558,
1349
- "eval_loss": 0.14287406206130981,
1350
- "eval_runtime": 103.938,
1351
- "eval_samples_per_second": 5.744,
1352
- "eval_steps_per_second": 0.183,
1353
- "step": 1470
1354
- },
1355
- {
1356
- "epoch": 35.23809523809524,
1357
- "grad_norm": 1.2321009635925293,
1358
- "learning_rate": 1.6402116402116404e-05,
1359
- "loss": 0.036,
1360
- "step": 1480
1361
- },
1362
- {
1363
- "epoch": 35.476190476190474,
1364
- "grad_norm": 1.9735788106918335,
1365
- "learning_rate": 1.6137566137566136e-05,
1366
- "loss": 0.0485,
1367
- "step": 1490
1368
- },
1369
- {
1370
- "epoch": 35.714285714285715,
1371
- "grad_norm": 0.7221766114234924,
1372
- "learning_rate": 1.5873015873015872e-05,
1373
- "loss": 0.0518,
1374
- "step": 1500
1375
  },
1376
  {
1377
- "epoch": 35.95238095238095,
1378
- "grad_norm": 1.484800100326538,
1379
- "learning_rate": 1.560846560846561e-05,
1380
- "loss": 0.0332,
1381
- "step": 1510
 
 
1382
  },
1383
  {
1384
- "epoch": 36.0,
1385
- "eval_accuracy": 0.964824120603015,
1386
- "eval_loss": 0.16714587807655334,
1387
- "eval_runtime": 103.1161,
1388
- "eval_samples_per_second": 5.79,
1389
- "eval_steps_per_second": 0.184,
1390
- "step": 1512
1391
  },
1392
  {
1393
- "epoch": 36.19047619047619,
1394
- "grad_norm": 2.1778924465179443,
1395
- "learning_rate": 1.5343915343915344e-05,
1396
- "loss": 0.058,
1397
- "step": 1520
 
 
 
 
 
 
 
 
 
1398
  },
1399
  {
1400
- "epoch": 36.42857142857143,
1401
- "grad_norm": 0.6789590120315552,
1402
- "learning_rate": 1.5079365079365079e-05,
1403
- "loss": 0.0341,
1404
- "step": 1530
 
 
1405
  },
1406
  {
1407
- "epoch": 36.666666666666664,
1408
- "grad_norm": 1.6879972219467163,
1409
  "learning_rate": 1.4814814814814815e-05,
1410
- "loss": 0.0395,
1411
- "step": 1540
1412
- },
1413
- {
1414
- "epoch": 36.904761904761905,
1415
- "grad_norm": 0.9731617569923401,
1416
- "learning_rate": 1.455026455026455e-05,
1417
- "loss": 0.0432,
1418
- "step": 1550
1419
- },
1420
- {
1421
- "epoch": 37.0,
1422
- "eval_accuracy": 0.966499162479062,
1423
- "eval_loss": 0.14411257207393646,
1424
- "eval_runtime": 103.3621,
1425
- "eval_samples_per_second": 5.776,
1426
- "eval_steps_per_second": 0.184,
1427
- "step": 1554
1428
- },
1429
- {
1430
- "epoch": 37.142857142857146,
1431
- "grad_norm": 1.3034260272979736,
1432
- "learning_rate": 1.4285714285714285e-05,
1433
- "loss": 0.0376,
1434
- "step": 1560
1435
- },
1436
- {
1437
- "epoch": 37.38095238095238,
1438
- "grad_norm": 0.8573827743530273,
1439
- "learning_rate": 1.4021164021164022e-05,
1440
- "loss": 0.0475,
1441
- "step": 1570
1442
- },
1443
- {
1444
- "epoch": 37.61904761904762,
1445
- "grad_norm": 0.7534766793251038,
1446
- "learning_rate": 1.3756613756613756e-05,
1447
- "loss": 0.0519,
1448
- "step": 1580
1449
- },
1450
- {
1451
- "epoch": 37.857142857142854,
1452
- "grad_norm": 0.7222949266433716,
1453
- "learning_rate": 1.3492063492063492e-05,
1454
- "loss": 0.0354,
1455
- "step": 1590
1456
- },
1457
- {
1458
- "epoch": 38.0,
1459
- "eval_accuracy": 0.9681742043551089,
1460
- "eval_loss": 0.15929608047008514,
1461
- "eval_runtime": 107.453,
1462
- "eval_samples_per_second": 5.556,
1463
- "eval_steps_per_second": 0.177,
1464
- "step": 1596
1465
- },
1466
- {
1467
- "epoch": 38.095238095238095,
1468
- "grad_norm": 1.8834507465362549,
1469
- "learning_rate": 1.3227513227513228e-05,
1470
- "loss": 0.0338,
1471
- "step": 1600
1472
- },
1473
- {
1474
- "epoch": 38.333333333333336,
1475
- "grad_norm": 0.6104734539985657,
1476
- "learning_rate": 1.2962962962962962e-05,
1477
- "loss": 0.0462,
1478
- "step": 1610
1479
- },
1480
- {
1481
- "epoch": 38.57142857142857,
1482
- "grad_norm": 0.7713958024978638,
1483
- "learning_rate": 1.2698412698412699e-05,
1484
- "loss": 0.0279,
1485
- "step": 1620
1486
- },
1487
- {
1488
- "epoch": 38.80952380952381,
1489
- "grad_norm": 0.2185550034046173,
1490
- "learning_rate": 1.2433862433862433e-05,
1491
- "loss": 0.0432,
1492
- "step": 1630
1493
  },
1494
  {
1495
- "epoch": 39.0,
1496
- "eval_accuracy": 0.966499162479062,
1497
- "eval_loss": 0.13952085375785828,
1498
- "eval_runtime": 107.8325,
1499
- "eval_samples_per_second": 5.536,
1500
- "eval_steps_per_second": 0.176,
1501
- "step": 1638
1502
  },
1503
  {
1504
- "epoch": 39.04761904761905,
1505
- "grad_norm": 1.2039453983306885,
1506
- "learning_rate": 1.2169312169312169e-05,
1507
- "loss": 0.0451,
1508
- "step": 1640
1509
  },
1510
  {
1511
- "epoch": 39.285714285714285,
1512
- "grad_norm": 2.02799391746521,
1513
- "learning_rate": 1.1904761904761905e-05,
1514
- "loss": 0.0249,
1515
- "step": 1650
 
 
1516
  },
1517
  {
1518
- "epoch": 39.523809523809526,
1519
- "grad_norm": 0.5916322469711304,
1520
- "learning_rate": 1.164021164021164e-05,
1521
- "loss": 0.0408,
1522
- "step": 1660
1523
  },
1524
  {
1525
- "epoch": 39.76190476190476,
1526
- "grad_norm": 2.4064204692840576,
1527
- "learning_rate": 1.1375661375661376e-05,
1528
- "loss": 0.0439,
1529
- "step": 1670
 
 
1530
  },
1531
  {
1532
- "epoch": 40.0,
1533
- "grad_norm": 0.05854567885398865,
1534
  "learning_rate": 1.1111111111111112e-05,
1535
- "loss": 0.0363,
1536
- "step": 1680
1537
- },
1538
- {
1539
- "epoch": 40.0,
1540
- "eval_accuracy": 0.9731993299832495,
1541
- "eval_loss": 0.10922118276357651,
1542
- "eval_runtime": 107.5538,
1543
- "eval_samples_per_second": 5.551,
1544
- "eval_steps_per_second": 0.177,
1545
- "step": 1680
1546
- },
1547
- {
1548
- "epoch": 40.23809523809524,
1549
- "grad_norm": 1.7765388488769531,
1550
- "learning_rate": 1.0846560846560846e-05,
1551
- "loss": 0.0337,
1552
- "step": 1690
1553
- },
1554
- {
1555
- "epoch": 40.476190476190474,
1556
- "grad_norm": 1.7078924179077148,
1557
- "learning_rate": 1.0582010582010582e-05,
1558
- "loss": 0.0272,
1559
- "step": 1700
1560
- },
1561
- {
1562
- "epoch": 40.714285714285715,
1563
- "grad_norm": 1.3066785335540771,
1564
- "learning_rate": 1.0317460317460318e-05,
1565
- "loss": 0.0408,
1566
- "step": 1710
1567
- },
1568
- {
1569
- "epoch": 40.95238095238095,
1570
- "grad_norm": 0.261738121509552,
1571
- "learning_rate": 1.0052910052910053e-05,
1572
- "loss": 0.0288,
1573
- "step": 1720
1574
- },
1575
- {
1576
- "epoch": 41.0,
1577
- "eval_accuracy": 0.966499162479062,
1578
- "eval_loss": 0.15500447154045105,
1579
- "eval_runtime": 107.8053,
1580
- "eval_samples_per_second": 5.538,
1581
- "eval_steps_per_second": 0.176,
1582
- "step": 1722
1583
- },
1584
- {
1585
- "epoch": 41.19047619047619,
1586
- "grad_norm": 0.7850339412689209,
1587
- "learning_rate": 9.788359788359789e-06,
1588
- "loss": 0.031,
1589
- "step": 1730
1590
- },
1591
- {
1592
- "epoch": 41.42857142857143,
1593
- "grad_norm": 1.2644939422607422,
1594
- "learning_rate": 9.523809523809523e-06,
1595
- "loss": 0.0435,
1596
- "step": 1740
1597
- },
1598
- {
1599
- "epoch": 41.666666666666664,
1600
- "grad_norm": 1.832721471786499,
1601
- "learning_rate": 9.259259259259259e-06,
1602
- "loss": 0.0275,
1603
- "step": 1750
1604
- },
1605
- {
1606
- "epoch": 41.904761904761905,
1607
- "grad_norm": 0.4885420501232147,
1608
- "learning_rate": 8.994708994708995e-06,
1609
- "loss": 0.0305,
1610
- "step": 1760
1611
- },
1612
- {
1613
- "epoch": 42.0,
1614
- "eval_accuracy": 0.9681742043551089,
1615
- "eval_loss": 0.14619530737400055,
1616
- "eval_runtime": 107.0057,
1617
- "eval_samples_per_second": 5.579,
1618
- "eval_steps_per_second": 0.178,
1619
- "step": 1764
1620
- },
1621
- {
1622
- "epoch": 42.142857142857146,
1623
- "grad_norm": 1.3336294889450073,
1624
- "learning_rate": 8.73015873015873e-06,
1625
- "loss": 0.0442,
1626
- "step": 1770
1627
  },
1628
  {
1629
- "epoch": 42.38095238095238,
1630
- "grad_norm": 1.4869195222854614,
1631
- "learning_rate": 8.465608465608466e-06,
1632
- "loss": 0.0367,
1633
- "step": 1780
 
 
1634
  },
1635
  {
1636
- "epoch": 42.61904761904762,
1637
- "grad_norm": 0.8099101781845093,
1638
- "learning_rate": 8.201058201058202e-06,
1639
- "loss": 0.0457,
1640
- "step": 1790
1641
  },
1642
  {
1643
- "epoch": 42.857142857142854,
1644
- "grad_norm": 0.36065608263015747,
1645
- "learning_rate": 7.936507936507936e-06,
1646
- "loss": 0.0326,
1647
- "step": 1800
 
 
1648
  },
1649
  {
1650
- "epoch": 43.0,
1651
- "eval_accuracy": 0.9681742043551089,
1652
- "eval_loss": 0.1343374401330948,
1653
- "eval_runtime": 107.1742,
1654
- "eval_samples_per_second": 5.57,
1655
- "eval_steps_per_second": 0.177,
1656
- "step": 1806
1657
  },
1658
  {
1659
- "epoch": 43.095238095238095,
1660
- "grad_norm": 0.7654176354408264,
1661
- "learning_rate": 7.671957671957672e-06,
1662
- "loss": 0.0182,
1663
- "step": 1810
 
 
1664
  },
1665
  {
1666
- "epoch": 43.333333333333336,
1667
- "grad_norm": 0.9363707900047302,
1668
  "learning_rate": 7.4074074074074075e-06,
1669
- "loss": 0.0364,
1670
- "step": 1820
1671
- },
1672
- {
1673
- "epoch": 43.57142857142857,
1674
- "grad_norm": 1.6939359903335571,
1675
- "learning_rate": 7.142857142857143e-06,
1676
- "loss": 0.0274,
1677
- "step": 1830
1678
- },
1679
- {
1680
- "epoch": 43.80952380952381,
1681
- "grad_norm": 0.285452663898468,
1682
- "learning_rate": 6.878306878306878e-06,
1683
- "loss": 0.027,
1684
- "step": 1840
1685
- },
1686
- {
1687
- "epoch": 44.0,
1688
- "eval_accuracy": 0.9731993299832495,
1689
- "eval_loss": 0.11093433946371078,
1690
- "eval_runtime": 107.916,
1691
- "eval_samples_per_second": 5.532,
1692
- "eval_steps_per_second": 0.176,
1693
- "step": 1848
1694
- },
1695
- {
1696
- "epoch": 44.04761904761905,
1697
- "grad_norm": 1.4939181804656982,
1698
- "learning_rate": 6.613756613756614e-06,
1699
- "loss": 0.035,
1700
- "step": 1850
1701
- },
1702
- {
1703
- "epoch": 44.285714285714285,
1704
- "grad_norm": 0.328795850276947,
1705
- "learning_rate": 6.349206349206349e-06,
1706
- "loss": 0.0328,
1707
- "step": 1860
1708
- },
1709
- {
1710
- "epoch": 44.523809523809526,
1711
- "grad_norm": 0.7609361410140991,
1712
- "learning_rate": 6.0846560846560845e-06,
1713
- "loss": 0.0383,
1714
- "step": 1870
1715
- },
1716
- {
1717
- "epoch": 44.76190476190476,
1718
- "grad_norm": 0.4592248201370239,
1719
- "learning_rate": 5.82010582010582e-06,
1720
- "loss": 0.0248,
1721
- "step": 1880
1722
- },
1723
- {
1724
- "epoch": 45.0,
1725
- "grad_norm": 0.208053857088089,
1726
- "learning_rate": 5.555555555555556e-06,
1727
- "loss": 0.0233,
1728
- "step": 1890
1729
- },
1730
- {
1731
- "epoch": 45.0,
1732
- "eval_accuracy": 0.9731993299832495,
1733
- "eval_loss": 0.1315459907054901,
1734
- "eval_runtime": 105.7748,
1735
- "eval_samples_per_second": 5.644,
1736
- "eval_steps_per_second": 0.18,
1737
- "step": 1890
1738
- },
1739
- {
1740
- "epoch": 45.23809523809524,
1741
- "grad_norm": 0.4980098605155945,
1742
- "learning_rate": 5.291005291005291e-06,
1743
- "loss": 0.0209,
1744
- "step": 1900
1745
  },
1746
  {
1747
- "epoch": 45.476190476190474,
1748
- "grad_norm": 1.4569426774978638,
1749
- "learning_rate": 5.026455026455026e-06,
1750
- "loss": 0.0249,
1751
- "step": 1910
 
 
1752
  },
1753
  {
1754
- "epoch": 45.714285714285715,
1755
- "grad_norm": 0.21140669286251068,
1756
- "learning_rate": 4.7619047619047615e-06,
1757
- "loss": 0.0274,
1758
- "step": 1920
 
 
1759
  },
1760
  {
1761
- "epoch": 45.95238095238095,
1762
- "grad_norm": 0.9561221599578857,
1763
- "learning_rate": 4.497354497354498e-06,
1764
- "loss": 0.042,
1765
- "step": 1930
1766
  },
1767
  {
1768
- "epoch": 46.0,
1769
- "eval_accuracy": 0.9731993299832495,
1770
- "eval_loss": 0.1261177957057953,
1771
- "eval_runtime": 106.0727,
1772
- "eval_samples_per_second": 5.628,
1773
- "eval_steps_per_second": 0.179,
1774
- "step": 1932
1775
  },
1776
  {
1777
- "epoch": 46.19047619047619,
1778
- "grad_norm": 1.0604710578918457,
1779
- "learning_rate": 4.232804232804233e-06,
1780
- "loss": 0.022,
1781
- "step": 1940
1782
  },
1783
  {
1784
- "epoch": 46.42857142857143,
1785
- "grad_norm": 2.230454683303833,
1786
- "learning_rate": 3.968253968253968e-06,
1787
- "loss": 0.0351,
1788
- "step": 1950
 
 
1789
  },
1790
  {
1791
- "epoch": 46.666666666666664,
1792
- "grad_norm": 0.6946862936019897,
1793
  "learning_rate": 3.7037037037037037e-06,
1794
- "loss": 0.0301,
1795
- "step": 1960
1796
- },
1797
- {
1798
- "epoch": 46.904761904761905,
1799
- "grad_norm": 0.760875403881073,
1800
- "learning_rate": 3.439153439153439e-06,
1801
- "loss": 0.0251,
1802
- "step": 1970
1803
- },
1804
- {
1805
- "epoch": 47.0,
1806
- "eval_accuracy": 0.9731993299832495,
1807
- "eval_loss": 0.13198845088481903,
1808
- "eval_runtime": 107.949,
1809
- "eval_samples_per_second": 5.53,
1810
- "eval_steps_per_second": 0.176,
1811
- "step": 1974
1812
- },
1813
- {
1814
- "epoch": 47.142857142857146,
1815
- "grad_norm": 0.19793380796909332,
1816
- "learning_rate": 3.1746031746031746e-06,
1817
- "loss": 0.0194,
1818
- "step": 1980
1819
- },
1820
- {
1821
- "epoch": 47.38095238095238,
1822
- "grad_norm": 1.171229362487793,
1823
- "learning_rate": 2.91005291005291e-06,
1824
- "loss": 0.0207,
1825
- "step": 1990
1826
- },
1827
- {
1828
- "epoch": 47.61904761904762,
1829
- "grad_norm": 1.4025137424468994,
1830
- "learning_rate": 2.6455026455026455e-06,
1831
- "loss": 0.0264,
1832
- "step": 2000
1833
- },
1834
- {
1835
- "epoch": 47.857142857142854,
1836
- "grad_norm": 0.5469716191291809,
1837
- "learning_rate": 2.3809523809523808e-06,
1838
- "loss": 0.041,
1839
- "step": 2010
1840
- },
1841
- {
1842
- "epoch": 48.0,
1843
- "eval_accuracy": 0.9731993299832495,
1844
- "eval_loss": 0.12820355594158173,
1845
- "eval_runtime": 107.9105,
1846
- "eval_samples_per_second": 5.532,
1847
- "eval_steps_per_second": 0.176,
1848
- "step": 2016
1849
- },
1850
- {
1851
- "epoch": 48.095238095238095,
1852
- "grad_norm": 0.5962417125701904,
1853
- "learning_rate": 2.1164021164021164e-06,
1854
- "loss": 0.0257,
1855
- "step": 2020
1856
- },
1857
- {
1858
- "epoch": 48.333333333333336,
1859
- "grad_norm": 1.9686793088912964,
1860
- "learning_rate": 1.8518518518518519e-06,
1861
- "loss": 0.0297,
1862
- "step": 2030
1863
- },
1864
- {
1865
- "epoch": 48.57142857142857,
1866
- "grad_norm": 1.7944121360778809,
1867
- "learning_rate": 1.5873015873015873e-06,
1868
- "loss": 0.0272,
1869
- "step": 2040
1870
  },
1871
  {
1872
- "epoch": 48.80952380952381,
1873
- "grad_norm": 0.4869058430194855,
1874
- "learning_rate": 1.3227513227513228e-06,
1875
- "loss": 0.0445,
1876
- "step": 2050
 
 
 
 
 
 
 
 
 
1877
  },
1878
  {
1879
- "epoch": 49.0,
1880
- "eval_accuracy": 0.9731993299832495,
1881
- "eval_loss": 0.12962684035301208,
1882
- "eval_runtime": 107.6607,
1883
- "eval_samples_per_second": 5.545,
1884
- "eval_steps_per_second": 0.176,
1885
- "step": 2058
1886
  },
1887
  {
1888
- "epoch": 49.04761904761905,
1889
- "grad_norm": 0.599247932434082,
1890
- "learning_rate": 1.0582010582010582e-06,
1891
- "loss": 0.0289,
1892
- "step": 2060
1893
  },
1894
  {
1895
- "epoch": 49.285714285714285,
1896
- "grad_norm": 1.3819619417190552,
1897
- "learning_rate": 7.936507936507937e-07,
1898
- "loss": 0.0236,
1899
- "step": 2070
 
 
1900
  },
1901
  {
1902
- "epoch": 49.523809523809526,
1903
- "grad_norm": 0.9784670472145081,
1904
- "learning_rate": 5.291005291005291e-07,
1905
- "loss": 0.0306,
1906
- "step": 2080
1907
  },
1908
  {
1909
- "epoch": 49.76190476190476,
1910
- "grad_norm": 0.8563596606254578,
1911
- "learning_rate": 2.6455026455026455e-07,
1912
- "loss": 0.0212,
1913
- "step": 2090
 
 
1914
  },
1915
  {
1916
- "epoch": 50.0,
1917
- "grad_norm": 1.10076904296875,
1918
- "learning_rate": 0.0,
1919
- "loss": 0.0308,
1920
- "step": 2100
1921
- },
1922
- {
1923
- "epoch": 50.0,
1924
- "eval_accuracy": 0.9731993299832495,
1925
- "eval_loss": 0.13253676891326904,
1926
- "eval_runtime": 110.4625,
1927
- "eval_samples_per_second": 5.405,
1928
- "eval_steps_per_second": 0.172,
1929
- "step": 2100
1930
- },
1931
- {
1932
- "epoch": 50.0,
1933
- "step": 2100,
1934
- "total_flos": 2.0803097508518707e+19,
1935
- "train_loss": 0.1354110169055916,
1936
- "train_runtime": 52600.7464,
1937
- "train_samples_per_second": 5.104,
1938
- "train_steps_per_second": 0.04
1939
  }
1940
  ],
1941
  "logging_steps": 10,
1942
- "max_steps": 2100,
1943
  "num_input_tokens_seen": 0,
1944
  "num_train_epochs": 50,
1945
  "save_steps": 500,
@@ -1955,7 +773,7 @@
1955
  "attributes": {}
1956
  }
1957
  },
1958
- "total_flos": 2.0803097508518707e+19,
1959
  "train_batch_size": 32,
1960
  "trial_name": null,
1961
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9338235294117647,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-419",
4
+ "epoch": 46.15384615384615,
5
  "eval_steps": 500,
6
+ "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9230769230769231,
13
+ "eval_accuracy": 0.2647058823529412,
14
+ "eval_loss": 1.7345659732818604,
15
+ "eval_runtime": 30.9498,
16
+ "eval_samples_per_second": 4.394,
17
+ "eval_steps_per_second": 0.162,
18
+ "step": 9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  },
20
  {
21
+ "epoch": 1.0256410256410255,
22
+ "grad_norm": 1.5590156316757202,
23
+ "learning_rate": 1.1111111111111112e-05,
24
+ "loss": 1.7645,
25
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  },
27
  {
28
+ "epoch": 1.9487179487179487,
29
+ "eval_accuracy": 0.3088235294117647,
30
+ "eval_loss": 1.6151821613311768,
31
+ "eval_runtime": 30.7387,
32
+ "eval_samples_per_second": 4.424,
33
+ "eval_steps_per_second": 0.163,
34
+ "step": 19
35
  },
36
  {
37
+ "epoch": 2.051282051282051,
38
+ "grad_norm": 1.000002384185791,
39
+ "learning_rate": 2.2222222222222223e-05,
40
+ "loss": 1.661,
41
+ "step": 20
 
 
42
  },
43
  {
44
+ "epoch": 2.9743589743589745,
45
+ "eval_accuracy": 0.4117647058823529,
46
+ "eval_loss": 1.4663141965866089,
47
+ "eval_runtime": 30.7848,
48
+ "eval_samples_per_second": 4.418,
49
+ "eval_steps_per_second": 0.162,
50
+ "step": 29
51
  },
52
  {
53
+ "epoch": 3.076923076923077,
54
+ "grad_norm": 1.144618272781372,
55
  "learning_rate": 3.3333333333333335e-05,
56
+ "loss": 1.496,
57
+ "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  {
60
  "epoch": 4.0,
61
+ "eval_accuracy": 0.4852941176470588,
62
+ "eval_loss": 1.2988938093185425,
63
+ "eval_runtime": 29.3106,
64
+ "eval_samples_per_second": 4.64,
65
+ "eval_steps_per_second": 0.171,
66
+ "step": 39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
+ "epoch": 4.102564102564102,
70
+ "grad_norm": 0.9698243141174316,
71
+ "learning_rate": 4.4444444444444447e-05,
72
+ "loss": 1.3097,
73
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  },
75
  {
76
+ "epoch": 4.923076923076923,
77
+ "eval_accuracy": 0.5588235294117647,
78
+ "eval_loss": 1.1490617990493774,
79
+ "eval_runtime": 29.781,
80
+ "eval_samples_per_second": 4.567,
81
+ "eval_steps_per_second": 0.168,
82
+ "step": 48
83
  },
84
  {
85
+ "epoch": 5.128205128205128,
86
+ "grad_norm": 1.00479257106781,
87
+ "learning_rate": 4.938271604938271e-05,
88
+ "loss": 1.091,
89
+ "step": 50
90
  },
91
  {
92
+ "epoch": 5.948717948717949,
93
+ "eval_accuracy": 0.7205882352941176,
94
+ "eval_loss": 0.9932733178138733,
95
+ "eval_runtime": 30.4494,
96
+ "eval_samples_per_second": 4.466,
97
+ "eval_steps_per_second": 0.164,
98
+ "step": 58
99
  },
100
  {
101
+ "epoch": 6.153846153846154,
102
+ "grad_norm": 1.4680265188217163,
103
  "learning_rate": 4.814814814814815e-05,
104
+ "loss": 0.9088,
105
+ "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  },
107
  {
108
+ "epoch": 6.9743589743589745,
109
+ "eval_accuracy": 0.6985294117647058,
110
+ "eval_loss": 0.9170573353767395,
111
+ "eval_runtime": 29.5689,
112
+ "eval_samples_per_second": 4.599,
113
+ "eval_steps_per_second": 0.169,
114
+ "step": 68
115
  },
116
  {
117
+ "epoch": 7.17948717948718,
118
+ "grad_norm": 1.2828285694122314,
119
+ "learning_rate": 4.691358024691358e-05,
120
+ "loss": 0.7858,
121
+ "step": 70
122
  },
123
  {
124
  "epoch": 8.0,
125
+ "eval_accuracy": 0.7720588235294118,
126
+ "eval_loss": 0.8300582766532898,
127
+ "eval_runtime": 30.9278,
128
+ "eval_samples_per_second": 4.397,
129
+ "eval_steps_per_second": 0.162,
130
+ "step": 78
131
+ },
132
+ {
133
+ "epoch": 8.205128205128204,
134
+ "grad_norm": 1.7692698240280151,
135
+ "learning_rate": 4.567901234567901e-05,
136
+ "loss": 0.7016,
137
+ "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  },
139
  {
140
+ "epoch": 8.923076923076923,
141
+ "eval_accuracy": 0.7352941176470589,
142
+ "eval_loss": 0.7925190925598145,
143
+ "eval_runtime": 32.3375,
144
+ "eval_samples_per_second": 4.206,
145
+ "eval_steps_per_second": 0.155,
146
+ "step": 87
147
  },
148
  {
149
+ "epoch": 9.23076923076923,
150
+ "grad_norm": 1.7534127235412598,
151
  "learning_rate": 4.4444444444444447e-05,
152
+ "loss": 0.6136,
153
+ "step": 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  },
155
  {
156
+ "epoch": 9.948717948717949,
157
+ "eval_accuracy": 0.7647058823529411,
158
+ "eval_loss": 0.6992273330688477,
159
+ "eval_runtime": 30.502,
160
+ "eval_samples_per_second": 4.459,
161
+ "eval_steps_per_second": 0.164,
162
+ "step": 97
163
  },
164
  {
165
+ "epoch": 10.256410256410255,
166
+ "grad_norm": 1.2740432024002075,
167
+ "learning_rate": 4.3209876543209875e-05,
168
+ "loss": 0.532,
169
+ "step": 100
170
  },
171
  {
172
+ "epoch": 10.974358974358974,
173
+ "eval_accuracy": 0.8308823529411765,
174
+ "eval_loss": 0.6400743126869202,
175
+ "eval_runtime": 31.9094,
176
+ "eval_samples_per_second": 4.262,
177
+ "eval_steps_per_second": 0.157,
178
+ "step": 107
179
  },
180
  {
181
+ "epoch": 11.282051282051283,
182
+ "grad_norm": 1.563349723815918,
183
+ "learning_rate": 4.197530864197531e-05,
184
+ "loss": 0.5018,
185
+ "step": 110
186
  },
187
  {
188
  "epoch": 12.0,
189
+ "eval_accuracy": 0.8382352941176471,
190
+ "eval_loss": 0.5786880254745483,
191
+ "eval_runtime": 29.8171,
192
+ "eval_samples_per_second": 4.561,
193
+ "eval_steps_per_second": 0.168,
194
+ "step": 117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  },
196
  {
197
+ "epoch": 12.307692307692308,
198
+ "grad_norm": 2.5129995346069336,
 
 
 
 
 
 
 
199
  "learning_rate": 4.074074074074074e-05,
200
+ "loss": 0.4279,
201
+ "step": 120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  },
203
  {
204
+ "epoch": 12.923076923076923,
205
+ "eval_accuracy": 0.8088235294117647,
206
+ "eval_loss": 0.6129594445228577,
207
+ "eval_runtime": 29.826,
208
+ "eval_samples_per_second": 4.56,
209
+ "eval_steps_per_second": 0.168,
210
+ "step": 126
211
  },
212
  {
213
+ "epoch": 13.333333333333334,
214
+ "grad_norm": 1.6143475770950317,
215
+ "learning_rate": 3.950617283950617e-05,
216
+ "loss": 0.4116,
217
+ "step": 130
218
  },
219
  {
220
+ "epoch": 13.948717948717949,
221
+ "eval_accuracy": 0.8382352941176471,
222
+ "eval_loss": 0.5089898109436035,
223
+ "eval_runtime": 29.9983,
224
+ "eval_samples_per_second": 4.534,
225
+ "eval_steps_per_second": 0.167,
226
+ "step": 136
227
  },
228
  {
229
+ "epoch": 14.35897435897436,
230
+ "grad_norm": 1.6241114139556885,
231
+ "learning_rate": 3.82716049382716e-05,
232
+ "loss": 0.3848,
233
+ "step": 140
234
  },
235
  {
236
+ "epoch": 14.974358974358974,
237
+ "eval_accuracy": 0.8676470588235294,
238
+ "eval_loss": 0.5165212154388428,
239
+ "eval_runtime": 30.2024,
240
+ "eval_samples_per_second": 4.503,
241
+ "eval_steps_per_second": 0.166,
242
+ "step": 146
243
  },
244
  {
245
+ "epoch": 15.384615384615385,
246
+ "grad_norm": 2.7321174144744873,
247
  "learning_rate": 3.7037037037037037e-05,
248
+ "loss": 0.3449,
249
+ "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  },
251
  {
252
+ "epoch": 16.0,
253
+ "eval_accuracy": 0.8382352941176471,
254
+ "eval_loss": 0.4842991530895233,
255
+ "eval_runtime": 29.5578,
256
+ "eval_samples_per_second": 4.601,
257
+ "eval_steps_per_second": 0.169,
258
+ "step": 156
259
+ },
260
+ {
261
+ "epoch": 16.41025641025641,
262
+ "grad_norm": 1.7041429281234741,
263
+ "learning_rate": 3.580246913580247e-05,
264
+ "loss": 0.3008,
265
+ "step": 160
266
  },
267
  {
268
+ "epoch": 16.923076923076923,
269
+ "eval_accuracy": 0.8455882352941176,
270
+ "eval_loss": 0.5460208058357239,
271
+ "eval_runtime": 29.7055,
272
+ "eval_samples_per_second": 4.578,
273
+ "eval_steps_per_second": 0.168,
274
+ "step": 165
275
  },
276
  {
277
+ "epoch": 17.435897435897434,
278
+ "grad_norm": 2.1159939765930176,
279
+ "learning_rate": 3.45679012345679e-05,
280
+ "loss": 0.2797,
281
+ "step": 170
282
  },
283
  {
284
+ "epoch": 17.94871794871795,
285
+ "eval_accuracy": 0.8308823529411765,
286
+ "eval_loss": 0.4984971880912781,
287
+ "eval_runtime": 29.7303,
288
+ "eval_samples_per_second": 4.574,
289
+ "eval_steps_per_second": 0.168,
290
+ "step": 175
291
  },
292
  {
293
+ "epoch": 18.46153846153846,
294
+ "grad_norm": 1.050848126411438,
295
  "learning_rate": 3.3333333333333335e-05,
296
+ "loss": 0.2696,
297
+ "step": 180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  },
299
  {
300
+ "epoch": 18.974358974358974,
301
+ "eval_accuracy": 0.8455882352941176,
302
+ "eval_loss": 0.5585992336273193,
303
+ "eval_runtime": 29.5612,
304
+ "eval_samples_per_second": 4.601,
305
+ "eval_steps_per_second": 0.169,
306
+ "step": 185
307
  },
308
  {
309
+ "epoch": 19.487179487179485,
310
+ "grad_norm": 2.385378360748291,
311
+ "learning_rate": 3.209876543209876e-05,
312
+ "loss": 0.2633,
313
+ "step": 190
314
  },
315
  {
316
+ "epoch": 20.0,
317
+ "eval_accuracy": 0.9044117647058824,
318
+ "eval_loss": 0.43493831157684326,
319
+ "eval_runtime": 29.6728,
320
+ "eval_samples_per_second": 4.583,
321
+ "eval_steps_per_second": 0.169,
322
+ "step": 195
323
+ },
324
+ {
325
+ "epoch": 20.51282051282051,
326
+ "grad_norm": 1.972583293914795,
327
+ "learning_rate": 3.08641975308642e-05,
328
+ "loss": 0.2569,
329
+ "step": 200
330
  },
331
  {
332
+ "epoch": 20.923076923076923,
333
+ "eval_accuracy": 0.8897058823529411,
334
+ "eval_loss": 0.4017449617385864,
335
+ "eval_runtime": 31.5372,
336
+ "eval_samples_per_second": 4.312,
337
+ "eval_steps_per_second": 0.159,
338
+ "step": 204
339
  },
340
  {
341
+ "epoch": 21.53846153846154,
342
+ "grad_norm": 2.712113380432129,
343
  "learning_rate": 2.962962962962963e-05,
344
+ "loss": 0.27,
345
+ "step": 210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  },
347
  {
348
+ "epoch": 21.94871794871795,
349
+ "eval_accuracy": 0.8602941176470589,
350
+ "eval_loss": 0.4758412837982178,
351
+ "eval_runtime": 30.0407,
352
+ "eval_samples_per_second": 4.527,
353
+ "eval_steps_per_second": 0.166,
354
+ "step": 214
355
  },
356
  {
357
+ "epoch": 22.564102564102566,
358
+ "grad_norm": 1.7184677124023438,
359
+ "learning_rate": 2.839506172839506e-05,
360
+ "loss": 0.2706,
361
+ "step": 220
362
  },
363
  {
364
+ "epoch": 22.974358974358974,
365
+ "eval_accuracy": 0.8897058823529411,
366
+ "eval_loss": 0.41326794028282166,
367
+ "eval_runtime": 29.7459,
368
+ "eval_samples_per_second": 4.572,
369
+ "eval_steps_per_second": 0.168,
370
+ "step": 224
371
  },
372
  {
373
+ "epoch": 23.58974358974359,
374
+ "grad_norm": 1.8792766332626343,
375
+ "learning_rate": 2.7160493827160493e-05,
376
+ "loss": 0.2211,
377
+ "step": 230
378
  },
379
  {
380
+ "epoch": 24.0,
381
+ "eval_accuracy": 0.9117647058823529,
382
+ "eval_loss": 0.3844151794910431,
383
+ "eval_runtime": 29.8072,
384
+ "eval_samples_per_second": 4.563,
385
+ "eval_steps_per_second": 0.168,
386
+ "step": 234
387
  },
388
  {
389
+ "epoch": 24.615384615384617,
390
+ "grad_norm": 2.2763214111328125,
391
  "learning_rate": 2.5925925925925925e-05,
392
+ "loss": 0.1977,
393
+ "step": 240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  },
395
  {
396
+ "epoch": 24.923076923076923,
397
+ "eval_accuracy": 0.9264705882352942,
398
+ "eval_loss": 0.34974199533462524,
399
+ "eval_runtime": 29.6847,
400
+ "eval_samples_per_second": 4.581,
401
+ "eval_steps_per_second": 0.168,
402
+ "step": 243
403
  },
404
  {
405
+ "epoch": 25.641025641025642,
406
+ "grad_norm": 4.190616130828857,
407
+ "learning_rate": 2.4691358024691357e-05,
408
+ "loss": 0.1969,
409
+ "step": 250
410
  },
411
  {
412
+ "epoch": 25.94871794871795,
413
+ "eval_accuracy": 0.9044117647058824,
414
+ "eval_loss": 0.37360796332359314,
415
+ "eval_runtime": 31.1254,
416
+ "eval_samples_per_second": 4.369,
417
+ "eval_steps_per_second": 0.161,
418
+ "step": 253
419
  },
420
  {
421
+ "epoch": 26.666666666666668,
422
+ "grad_norm": 0.6893692016601562,
423
+ "learning_rate": 2.345679012345679e-05,
424
+ "loss": 0.1776,
425
+ "step": 260
426
  },
427
  {
428
+ "epoch": 26.974358974358974,
429
+ "eval_accuracy": 0.9044117647058824,
430
+ "eval_loss": 0.3796656131744385,
431
+ "eval_runtime": 29.8455,
432
+ "eval_samples_per_second": 4.557,
433
+ "eval_steps_per_second": 0.168,
434
+ "step": 263
435
  },
436
  {
437
+ "epoch": 27.692307692307693,
438
+ "grad_norm": 2.18731427192688,
439
  "learning_rate": 2.2222222222222223e-05,
440
+ "loss": 0.1787,
441
+ "step": 270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  },
443
  {
444
+ "epoch": 28.0,
445
+ "eval_accuracy": 0.8897058823529411,
446
+ "eval_loss": 0.39490777254104614,
447
+ "eval_runtime": 29.2795,
448
+ "eval_samples_per_second": 4.645,
449
+ "eval_steps_per_second": 0.171,
450
+ "step": 273
451
+ },
452
+ {
453
+ "epoch": 28.71794871794872,
454
+ "grad_norm": 1.656586766242981,
455
+ "learning_rate": 2.0987654320987655e-05,
456
+ "loss": 0.18,
457
+ "step": 280
458
  },
459
  {
460
+ "epoch": 28.923076923076923,
461
+ "eval_accuracy": 0.9264705882352942,
462
+ "eval_loss": 0.32775887846946716,
463
+ "eval_runtime": 29.3934,
464
+ "eval_samples_per_second": 4.627,
465
+ "eval_steps_per_second": 0.17,
466
+ "step": 282
467
  },
468
  {
469
+ "epoch": 29.743589743589745,
470
+ "grad_norm": 1.3221951723098755,
471
+ "learning_rate": 1.9753086419753087e-05,
472
+ "loss": 0.1797,
473
+ "step": 290
 
 
474
  },
475
  {
476
+ "epoch": 29.94871794871795,
477
+ "eval_accuracy": 0.9044117647058824,
478
+ "eval_loss": 0.36148715019226074,
479
+ "eval_runtime": 29.46,
480
+ "eval_samples_per_second": 4.616,
481
+ "eval_steps_per_second": 0.17,
482
+ "step": 292
483
  },
484
  {
485
+ "epoch": 30.76923076923077,
486
+ "grad_norm": 1.227844476699829,
487
  "learning_rate": 1.8518518518518518e-05,
488
+ "loss": 0.1665,
489
+ "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  },
491
  {
492
+ "epoch": 30.974358974358974,
493
+ "eval_accuracy": 0.8602941176470589,
494
+ "eval_loss": 0.4174344539642334,
495
+ "eval_runtime": 29.6798,
496
+ "eval_samples_per_second": 4.582,
497
+ "eval_steps_per_second": 0.168,
498
+ "step": 302
499
  },
500
  {
501
+ "epoch": 31.794871794871796,
502
+ "grad_norm": 2.3732173442840576,
503
+ "learning_rate": 1.728395061728395e-05,
504
+ "loss": 0.163,
505
+ "step": 310
 
 
506
  },
507
  {
508
+ "epoch": 32.0,
509
+ "eval_accuracy": 0.8970588235294118,
510
+ "eval_loss": 0.3574081063270569,
511
+ "eval_runtime": 29.8114,
512
+ "eval_samples_per_second": 4.562,
513
+ "eval_steps_per_second": 0.168,
514
+ "step": 312
515
+ },
516
+ {
517
+ "epoch": 32.82051282051282,
518
+ "grad_norm": 2.783026933670044,
519
+ "learning_rate": 1.604938271604938e-05,
520
+ "loss": 0.1498,
521
+ "step": 320
522
  },
523
  {
524
+ "epoch": 32.92307692307692,
525
+ "eval_accuracy": 0.9044117647058824,
526
+ "eval_loss": 0.35905760526657104,
527
+ "eval_runtime": 30.5584,
528
+ "eval_samples_per_second": 4.45,
529
+ "eval_steps_per_second": 0.164,
530
+ "step": 321
531
  },
532
  {
533
+ "epoch": 33.84615384615385,
534
+ "grad_norm": 2.496835231781006,
535
  "learning_rate": 1.4814814814814815e-05,
536
+ "loss": 0.1405,
537
+ "step": 330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
  },
539
  {
540
+ "epoch": 33.94871794871795,
541
+ "eval_accuracy": 0.9191176470588235,
542
+ "eval_loss": 0.30173459649086,
543
+ "eval_runtime": 30.2786,
544
+ "eval_samples_per_second": 4.492,
545
+ "eval_steps_per_second": 0.165,
546
+ "step": 331
547
  },
548
  {
549
+ "epoch": 34.87179487179487,
550
+ "grad_norm": 2.3252382278442383,
551
+ "learning_rate": 1.3580246913580247e-05,
552
+ "loss": 0.155,
553
+ "step": 340
554
  },
555
  {
556
+ "epoch": 34.97435897435897,
557
+ "eval_accuracy": 0.9264705882352942,
558
+ "eval_loss": 0.3303259611129761,
559
+ "eval_runtime": 29.5808,
560
+ "eval_samples_per_second": 4.598,
561
+ "eval_steps_per_second": 0.169,
562
+ "step": 341
563
  },
564
  {
565
+ "epoch": 35.8974358974359,
566
+ "grad_norm": 0.8514438271522522,
567
+ "learning_rate": 1.2345679012345678e-05,
568
+ "loss": 0.1519,
569
+ "step": 350
570
  },
571
  {
572
+ "epoch": 36.0,
573
+ "eval_accuracy": 0.8970588235294118,
574
+ "eval_loss": 0.3559113144874573,
575
+ "eval_runtime": 30.264,
576
+ "eval_samples_per_second": 4.494,
577
+ "eval_steps_per_second": 0.165,
578
+ "step": 351
579
  },
580
  {
581
+ "epoch": 36.92307692307692,
582
+ "grad_norm": 0.9986769556999207,
583
  "learning_rate": 1.1111111111111112e-05,
584
+ "loss": 0.1415,
585
+ "step": 360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  },
587
  {
588
+ "epoch": 36.92307692307692,
589
+ "eval_accuracy": 0.9191176470588235,
590
+ "eval_loss": 0.2890462279319763,
591
+ "eval_runtime": 30.7844,
592
+ "eval_samples_per_second": 4.418,
593
+ "eval_steps_per_second": 0.162,
594
+ "step": 360
595
  },
596
  {
597
+ "epoch": 37.94871794871795,
598
+ "grad_norm": 0.9639208912849426,
599
+ "learning_rate": 9.876543209876543e-06,
600
+ "loss": 0.1256,
601
+ "step": 370
602
  },
603
  {
604
+ "epoch": 37.94871794871795,
605
+ "eval_accuracy": 0.8897058823529411,
606
+ "eval_loss": 0.3445207476615906,
607
+ "eval_runtime": 29.9807,
608
+ "eval_samples_per_second": 4.536,
609
+ "eval_steps_per_second": 0.167,
610
+ "step": 370
611
  },
612
  {
613
+ "epoch": 38.97435897435897,
614
+ "grad_norm": 0.9389523267745972,
615
+ "learning_rate": 8.641975308641975e-06,
616
+ "loss": 0.1217,
617
+ "step": 380
 
 
618
  },
619
  {
620
+ "epoch": 38.97435897435897,
621
+ "eval_accuracy": 0.9117647058823529,
622
+ "eval_loss": 0.3434993028640747,
623
+ "eval_runtime": 30.2833,
624
+ "eval_samples_per_second": 4.491,
625
+ "eval_steps_per_second": 0.165,
626
+ "step": 380
627
  },
628
  {
629
+ "epoch": 40.0,
630
+ "grad_norm": 7.135587215423584,
631
  "learning_rate": 7.4074074074074075e-06,
632
+ "loss": 0.1285,
633
+ "step": 390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  },
635
  {
636
+ "epoch": 40.0,
637
+ "eval_accuracy": 0.9191176470588235,
638
+ "eval_loss": 0.30249181389808655,
639
+ "eval_runtime": 29.8301,
640
+ "eval_samples_per_second": 4.559,
641
+ "eval_steps_per_second": 0.168,
642
+ "step": 390
643
  },
644
  {
645
+ "epoch": 40.92307692307692,
646
+ "eval_accuracy": 0.8823529411764706,
647
+ "eval_loss": 0.36019423604011536,
648
+ "eval_runtime": 29.9271,
649
+ "eval_samples_per_second": 4.544,
650
+ "eval_steps_per_second": 0.167,
651
+ "step": 399
652
  },
653
  {
654
+ "epoch": 41.02564102564103,
655
+ "grad_norm": 0.5793786644935608,
656
+ "learning_rate": 6.172839506172839e-06,
657
+ "loss": 0.1301,
658
+ "step": 400
659
  },
660
  {
661
+ "epoch": 41.94871794871795,
662
+ "eval_accuracy": 0.8897058823529411,
663
+ "eval_loss": 0.3336350917816162,
664
+ "eval_runtime": 32.0768,
665
+ "eval_samples_per_second": 4.24,
666
+ "eval_steps_per_second": 0.156,
667
+ "step": 409
668
  },
669
  {
670
+ "epoch": 42.05128205128205,
671
+ "grad_norm": 2.5887136459350586,
672
+ "learning_rate": 4.938271604938272e-06,
673
+ "loss": 0.1243,
674
+ "step": 410
675
  },
676
  {
677
+ "epoch": 42.97435897435897,
678
+ "eval_accuracy": 0.9338235294117647,
679
+ "eval_loss": 0.28249993920326233,
680
+ "eval_runtime": 29.95,
681
+ "eval_samples_per_second": 4.541,
682
+ "eval_steps_per_second": 0.167,
683
+ "step": 419
684
  },
685
  {
686
+ "epoch": 43.07692307692308,
687
+ "grad_norm": 1.4639347791671753,
688
  "learning_rate": 3.7037037037037037e-06,
689
+ "loss": 0.1191,
690
+ "step": 420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  },
692
  {
693
+ "epoch": 44.0,
694
+ "eval_accuracy": 0.9264705882352942,
695
+ "eval_loss": 0.28346800804138184,
696
+ "eval_runtime": 32.4599,
697
+ "eval_samples_per_second": 4.19,
698
+ "eval_steps_per_second": 0.154,
699
+ "step": 429
700
+ },
701
+ {
702
+ "epoch": 44.1025641025641,
703
+ "grad_norm": 1.6088590621948242,
704
+ "learning_rate": 2.469135802469136e-06,
705
+ "loss": 0.1221,
706
+ "step": 430
707
  },
708
  {
709
+ "epoch": 44.92307692307692,
710
+ "eval_accuracy": 0.9191176470588235,
711
+ "eval_loss": 0.2723533809185028,
712
+ "eval_runtime": 30.5746,
713
+ "eval_samples_per_second": 4.448,
714
+ "eval_steps_per_second": 0.164,
715
+ "step": 438
716
  },
717
  {
718
+ "epoch": 45.12820512820513,
719
+ "grad_norm": 0.848240077495575,
720
+ "learning_rate": 1.234567901234568e-06,
721
+ "loss": 0.1151,
722
+ "step": 440
723
  },
724
  {
725
+ "epoch": 45.94871794871795,
726
+ "eval_accuracy": 0.9191176470588235,
727
+ "eval_loss": 0.27075088024139404,
728
+ "eval_runtime": 30.0665,
729
+ "eval_samples_per_second": 4.523,
730
+ "eval_steps_per_second": 0.166,
731
+ "step": 448
732
  },
733
  {
734
+ "epoch": 46.15384615384615,
735
+ "grad_norm": 3.371528148651123,
736
+ "learning_rate": 0.0,
737
+ "loss": 0.1195,
738
+ "step": 450
739
  },
740
  {
741
+ "epoch": 46.15384615384615,
742
+ "eval_accuracy": 0.9191176470588235,
743
+ "eval_loss": 0.2707464396953583,
744
+ "eval_runtime": 33.0815,
745
+ "eval_samples_per_second": 4.111,
746
+ "eval_steps_per_second": 0.151,
747
+ "step": 450
748
  },
749
  {
750
+ "epoch": 46.15384615384615,
751
+ "step": 450,
752
+ "total_flos": 4.3781443993328026e+18,
753
+ "train_loss": 0.4078153912226359,
754
+ "train_runtime": 14896.2203,
755
+ "train_samples_per_second": 4.108,
756
+ "train_steps_per_second": 0.03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
  }
758
  ],
759
  "logging_steps": 10,
760
+ "max_steps": 450,
761
  "num_input_tokens_seen": 0,
762
  "num_train_epochs": 50,
763
  "save_steps": 500,
 
773
  "attributes": {}
774
  }
775
  },
776
+ "total_flos": 4.3781443993328026e+18,
777
  "train_batch_size": 32,
778
  "trial_name": null,
779
  "trial_params": null