wcosmas commited on
Commit
b327fc5
1 Parent(s): bdf8fc6

End of training

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. train_results.json +6 -6
  5. trainer_state.json +564 -894
README.md CHANGED
@@ -33,7 +33,7 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.2870
37
  - Accuracy: 0.9338
38
 
39
  ## Model description
 
33
 
34
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.3311
37
  - Accuracy: 0.9338
38
 
39
  ## Model description
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 29.80392156862745,
3
- "eval_accuracy": 0.9411764705882353,
4
- "eval_loss": 0.2523283064365387,
5
- "eval_runtime": 49.5509,
6
- "eval_samples_per_second": 2.745,
7
- "eval_steps_per_second": 0.343,
8
- "total_flos": 2.827009127861453e+18,
9
- "train_loss": 0.3311853929046999,
10
- "train_runtime": 10131.822,
11
- "train_samples_per_second": 3.624,
12
- "train_steps_per_second": 0.113
13
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "eval_accuracy": 0.9338235294117647,
4
+ "eval_loss": 0.33108076453208923,
5
+ "eval_runtime": 29.2681,
6
+ "eval_samples_per_second": 4.647,
7
+ "eval_steps_per_second": 0.171,
8
+ "total_flos": 4.3781443993328026e+18,
9
+ "train_loss": 0.41361336602105037,
10
+ "train_runtime": 14457.8356,
11
+ "train_samples_per_second": 4.233,
12
+ "train_steps_per_second": 0.031
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 29.80392156862745,
3
- "eval_accuracy": 0.9411764705882353,
4
- "eval_loss": 0.2523283064365387,
5
- "eval_runtime": 49.5509,
6
- "eval_samples_per_second": 2.745,
7
- "eval_steps_per_second": 0.343
8
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "eval_accuracy": 0.9338235294117647,
4
+ "eval_loss": 0.33108076453208923,
5
+ "eval_runtime": 29.2681,
6
+ "eval_samples_per_second": 4.647,
7
+ "eval_steps_per_second": 0.171
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 29.80392156862745,
3
- "total_flos": 2.827009127861453e+18,
4
- "train_loss": 0.3311853929046999,
5
- "train_runtime": 10131.822,
6
- "train_samples_per_second": 3.624,
7
- "train_steps_per_second": 0.113
8
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "total_flos": 4.3781443993328026e+18,
4
+ "train_loss": 0.41361336602105037,
5
+ "train_runtime": 14457.8356,
6
+ "train_samples_per_second": 4.233,
7
+ "train_steps_per_second": 0.031
8
  }
trainer_state.json CHANGED
@@ -1,1095 +1,765 @@
1
  {
2
- "best_metric": 0.9411764705882353,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-765",
4
- "epoch": 29.80392156862745,
5
  "eval_steps": 500,
6
- "global_step": 1140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.26143790849673204,
13
- "grad_norm": 1.524962306022644,
14
- "learning_rate": 4.3859649122807014e-06,
15
- "loss": 1.8303,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.5228758169934641,
20
- "grad_norm": 1.3496487140655518,
21
- "learning_rate": 8.771929824561403e-06,
22
- "loss": 1.7834,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.7843137254901961,
27
- "grad_norm": 1.857043981552124,
28
- "learning_rate": 1.3157894736842106e-05,
29
- "loss": 1.6954,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.9934640522875817,
34
- "eval_accuracy": 0.34558823529411764,
35
- "eval_loss": 1.610603928565979,
36
- "eval_runtime": 33.3958,
37
- "eval_samples_per_second": 4.072,
38
- "eval_steps_per_second": 0.509,
39
- "step": 38
40
- },
41
- {
42
- "epoch": 1.0457516339869282,
43
- "grad_norm": 1.5567628145217896,
44
- "learning_rate": 1.7543859649122806e-05,
45
- "loss": 1.5872,
46
- "step": 40
47
  },
48
  {
49
- "epoch": 1.3071895424836601,
50
- "grad_norm": 1.5087599754333496,
51
- "learning_rate": 2.1929824561403507e-05,
52
- "loss": 1.4855,
53
- "step": 50
54
- },
55
- {
56
- "epoch": 1.5686274509803921,
57
- "grad_norm": 1.6773402690887451,
58
- "learning_rate": 2.6315789473684212e-05,
59
- "loss": 1.3823,
60
- "step": 60
61
- },
62
- {
63
- "epoch": 1.8300653594771243,
64
- "grad_norm": 1.4024033546447754,
65
- "learning_rate": 3.0701754385964913e-05,
66
- "loss": 1.2818,
67
- "step": 70
68
- },
69
- {
70
- "epoch": 1.9869281045751634,
71
- "eval_accuracy": 0.5735294117647058,
72
- "eval_loss": 1.2412389516830444,
73
- "eval_runtime": 32.7103,
74
- "eval_samples_per_second": 4.158,
75
- "eval_steps_per_second": 0.52,
76
- "step": 76
77
- },
78
- {
79
- "epoch": 2.0915032679738563,
80
- "grad_norm": 1.6061407327651978,
81
- "learning_rate": 3.508771929824561e-05,
82
- "loss": 1.2088,
83
- "step": 80
84
- },
85
- {
86
- "epoch": 2.3529411764705883,
87
- "grad_norm": 3.8964905738830566,
88
- "learning_rate": 3.9473684210526316e-05,
89
- "loss": 1.0983,
90
- "step": 90
91
- },
92
- {
93
- "epoch": 2.6143790849673203,
94
- "grad_norm": 3.24979305267334,
95
- "learning_rate": 4.3859649122807014e-05,
96
- "loss": 1.0632,
97
- "step": 100
98
- },
99
- {
100
- "epoch": 2.8758169934640523,
101
- "grad_norm": 2.1930079460144043,
102
- "learning_rate": 4.824561403508772e-05,
103
- "loss": 1.0023,
104
- "step": 110
105
  },
106
  {
107
- "epoch": 2.980392156862745,
108
- "eval_accuracy": 0.7132352941176471,
109
- "eval_loss": 0.9875355958938599,
110
- "eval_runtime": 32.8626,
111
- "eval_samples_per_second": 4.138,
112
- "eval_steps_per_second": 0.517,
113
- "step": 114
114
  },
115
  {
116
- "epoch": 3.1372549019607843,
117
- "grad_norm": 2.084902048110962,
118
- "learning_rate": 4.970760233918128e-05,
119
- "loss": 0.823,
120
- "step": 120
121
  },
122
  {
123
- "epoch": 3.3986928104575163,
124
- "grad_norm": 1.8761796951293945,
125
- "learning_rate": 4.9220272904483435e-05,
126
- "loss": 0.785,
127
- "step": 130
 
 
128
  },
129
  {
130
- "epoch": 3.6601307189542482,
131
- "grad_norm": 3.031520128250122,
132
- "learning_rate": 4.8732943469785574e-05,
133
- "loss": 0.7775,
134
- "step": 140
135
- },
136
- {
137
- "epoch": 3.9215686274509802,
138
- "grad_norm": 2.1056406497955322,
139
- "learning_rate": 4.824561403508772e-05,
140
- "loss": 0.7163,
141
- "step": 150
142
  },
143
  {
144
  "epoch": 4.0,
145
- "eval_accuracy": 0.6911764705882353,
146
- "eval_loss": 0.8399065136909485,
147
- "eval_runtime": 32.9831,
148
- "eval_samples_per_second": 4.123,
149
- "eval_steps_per_second": 0.515,
150
- "step": 153
151
- },
152
- {
153
- "epoch": 4.183006535947713,
154
- "grad_norm": 3.3159000873565674,
155
- "learning_rate": 4.7758284600389865e-05,
156
- "loss": 0.6677,
157
- "step": 160
158
- },
159
- {
160
- "epoch": 4.444444444444445,
161
- "grad_norm": 2.1595349311828613,
162
- "learning_rate": 4.727095516569201e-05,
163
- "loss": 0.6366,
164
- "step": 170
165
- },
166
- {
167
- "epoch": 4.705882352941177,
168
- "grad_norm": 2.2182297706604004,
169
- "learning_rate": 4.678362573099415e-05,
170
- "loss": 0.5876,
171
- "step": 180
172
- },
173
- {
174
- "epoch": 4.967320261437909,
175
- "grad_norm": 5.272632122039795,
176
- "learning_rate": 4.62962962962963e-05,
177
- "loss": 0.5173,
178
- "step": 190
179
- },
180
- {
181
- "epoch": 4.993464052287582,
182
- "eval_accuracy": 0.8161764705882353,
183
- "eval_loss": 0.6545882821083069,
184
- "eval_runtime": 31.9706,
185
- "eval_samples_per_second": 4.254,
186
- "eval_steps_per_second": 0.532,
187
- "step": 191
188
- },
189
- {
190
- "epoch": 5.228758169934641,
191
- "grad_norm": 5.355340003967285,
192
- "learning_rate": 4.580896686159844e-05,
193
- "loss": 0.5057,
194
- "step": 200
195
- },
196
- {
197
- "epoch": 5.490196078431373,
198
- "grad_norm": 2.3517274856567383,
199
- "learning_rate": 4.5321637426900585e-05,
200
- "loss": 0.4577,
201
- "step": 210
202
- },
203
- {
204
- "epoch": 5.751633986928105,
205
- "grad_norm": 7.760730266571045,
206
- "learning_rate": 4.483430799220273e-05,
207
- "loss": 0.5057,
208
- "step": 220
209
- },
210
- {
211
- "epoch": 5.9869281045751634,
212
- "eval_accuracy": 0.8308823529411765,
213
- "eval_loss": 0.6250559091567993,
214
- "eval_runtime": 32.3916,
215
- "eval_samples_per_second": 4.199,
216
- "eval_steps_per_second": 0.525,
217
- "step": 229
218
- },
219
- {
220
- "epoch": 6.0130718954248366,
221
- "grad_norm": 3.0986075401306152,
222
- "learning_rate": 4.4346978557504876e-05,
223
- "loss": 0.4856,
224
- "step": 230
225
- },
226
- {
227
- "epoch": 6.2745098039215685,
228
- "grad_norm": 7.31836462020874,
229
- "learning_rate": 4.3859649122807014e-05,
230
- "loss": 0.5036,
231
- "step": 240
232
- },
233
- {
234
- "epoch": 6.5359477124183005,
235
- "grad_norm": 2.5667953491210938,
236
- "learning_rate": 4.3372319688109166e-05,
237
- "loss": 0.4063,
238
- "step": 250
239
  },
240
  {
241
- "epoch": 6.7973856209150325,
242
- "grad_norm": 2.488330602645874,
243
- "learning_rate": 4.2884990253411305e-05,
244
- "loss": 0.4313,
245
- "step": 260
 
 
246
  },
247
  {
248
- "epoch": 6.980392156862745,
249
- "eval_accuracy": 0.8308823529411765,
250
- "eval_loss": 0.5696418881416321,
251
- "eval_runtime": 32.7903,
252
- "eval_samples_per_second": 4.148,
253
- "eval_steps_per_second": 0.518,
254
- "step": 267
255
  },
256
  {
257
- "epoch": 7.0588235294117645,
258
- "grad_norm": 4.5028228759765625,
259
- "learning_rate": 4.239766081871345e-05,
260
- "loss": 0.3409,
261
- "step": 270
 
 
262
  },
263
  {
264
- "epoch": 7.3202614379084965,
265
- "grad_norm": 3.8160030841827393,
266
- "learning_rate": 4.1910331384015596e-05,
267
- "loss": 0.4005,
268
- "step": 280
269
  },
270
  {
271
- "epoch": 7.5816993464052285,
272
- "grad_norm": 7.039416790008545,
273
- "learning_rate": 4.142300194931774e-05,
274
- "loss": 0.4047,
275
- "step": 290
 
 
276
  },
277
  {
278
- "epoch": 7.8431372549019605,
279
- "grad_norm": 2.3010506629943848,
280
- "learning_rate": 4.093567251461988e-05,
281
- "loss": 0.325,
282
- "step": 300
283
  },
284
  {
285
  "epoch": 8.0,
286
- "eval_accuracy": 0.8308823529411765,
287
- "eval_loss": 0.5506752133369446,
288
- "eval_runtime": 33.2532,
289
- "eval_samples_per_second": 4.09,
290
- "eval_steps_per_second": 0.511,
291
- "step": 306
292
- },
293
- {
294
- "epoch": 8.104575163398692,
295
- "grad_norm": 2.194329023361206,
296
- "learning_rate": 4.044834307992203e-05,
297
- "loss": 0.3016,
298
- "step": 310
299
- },
300
- {
301
- "epoch": 8.366013071895425,
302
- "grad_norm": 4.172213554382324,
303
- "learning_rate": 3.996101364522417e-05,
304
- "loss": 0.3194,
305
- "step": 320
306
- },
307
- {
308
- "epoch": 8.627450980392156,
309
- "grad_norm": 3.775691270828247,
310
- "learning_rate": 3.9473684210526316e-05,
311
- "loss": 0.3115,
312
- "step": 330
313
- },
314
- {
315
- "epoch": 8.88888888888889,
316
- "grad_norm": 3.285763740539551,
317
- "learning_rate": 3.898635477582846e-05,
318
- "loss": 0.3811,
319
- "step": 340
320
- },
321
- {
322
- "epoch": 8.993464052287582,
323
- "eval_accuracy": 0.8676470588235294,
324
- "eval_loss": 0.4429008662700653,
325
- "eval_runtime": 32.2721,
326
- "eval_samples_per_second": 4.214,
327
- "eval_steps_per_second": 0.527,
328
- "step": 344
329
- },
330
- {
331
- "epoch": 9.15032679738562,
332
- "grad_norm": 2.8171193599700928,
333
- "learning_rate": 3.849902534113061e-05,
334
- "loss": 0.3617,
335
- "step": 350
336
- },
337
- {
338
- "epoch": 9.411764705882353,
339
- "grad_norm": 2.766669273376465,
340
- "learning_rate": 3.8011695906432746e-05,
341
- "loss": 0.2679,
342
- "step": 360
343
- },
344
- {
345
- "epoch": 9.673202614379084,
346
- "grad_norm": 5.8598833084106445,
347
- "learning_rate": 3.75243664717349e-05,
348
- "loss": 0.2909,
349
- "step": 370
350
- },
351
- {
352
- "epoch": 9.934640522875817,
353
- "grad_norm": 4.5529608726501465,
354
- "learning_rate": 3.7037037037037037e-05,
355
- "loss": 0.2341,
356
- "step": 380
357
- },
358
- {
359
- "epoch": 9.986928104575163,
360
- "eval_accuracy": 0.875,
361
- "eval_loss": 0.42220985889434814,
362
- "eval_runtime": 32.0969,
363
- "eval_samples_per_second": 4.237,
364
- "eval_steps_per_second": 0.53,
365
- "step": 382
366
- },
367
- {
368
- "epoch": 10.196078431372548,
369
- "grad_norm": 3.98348069190979,
370
- "learning_rate": 3.654970760233918e-05,
371
- "loss": 0.2612,
372
- "step": 390
373
  },
374
  {
375
- "epoch": 10.457516339869281,
376
- "grad_norm": 5.887331485748291,
377
- "learning_rate": 3.606237816764133e-05,
378
- "loss": 0.2518,
379
- "step": 400
 
 
380
  },
381
  {
382
- "epoch": 10.718954248366012,
383
- "grad_norm": 3.759489059448242,
384
- "learning_rate": 3.557504873294347e-05,
385
- "loss": 0.2361,
386
- "step": 410
387
- },
388
- {
389
- "epoch": 10.980392156862745,
390
- "grad_norm": 6.444194793701172,
391
- "learning_rate": 3.508771929824561e-05,
392
- "loss": 0.3082,
393
- "step": 420
394
  },
395
  {
396
- "epoch": 10.980392156862745,
397
- "eval_accuracy": 0.7720588235294118,
398
- "eval_loss": 0.6572562456130981,
399
- "eval_runtime": 32.1021,
400
- "eval_samples_per_second": 4.236,
401
- "eval_steps_per_second": 0.53,
402
- "step": 420
403
  },
404
  {
405
- "epoch": 11.241830065359476,
406
- "grad_norm": 2.5144402980804443,
407
- "learning_rate": 3.4600389863547764e-05,
408
- "loss": 0.2284,
409
- "step": 430
410
  },
411
  {
412
- "epoch": 11.50326797385621,
413
- "grad_norm": 5.127187728881836,
414
- "learning_rate": 3.41130604288499e-05,
415
- "loss": 0.2353,
416
- "step": 440
 
 
417
  },
418
  {
419
- "epoch": 11.764705882352942,
420
- "grad_norm": 3.2781078815460205,
421
- "learning_rate": 3.362573099415205e-05,
422
- "loss": 0.2571,
423
- "step": 450
424
  },
425
  {
426
  "epoch": 12.0,
427
- "eval_accuracy": 0.8897058823529411,
428
- "eval_loss": 0.4229148328304291,
429
- "eval_runtime": 31.8733,
430
- "eval_samples_per_second": 4.267,
431
- "eval_steps_per_second": 0.533,
432
- "step": 459
433
- },
434
- {
435
- "epoch": 12.026143790849673,
436
- "grad_norm": 1.4445286989212036,
437
- "learning_rate": 3.313840155945419e-05,
438
- "loss": 0.2537,
439
- "step": 460
440
- },
441
- {
442
- "epoch": 12.287581699346406,
443
- "grad_norm": 3.5303471088409424,
444
- "learning_rate": 3.265107212475634e-05,
445
- "loss": 0.23,
446
- "step": 470
447
- },
448
- {
449
- "epoch": 12.549019607843137,
450
- "grad_norm": 4.927082538604736,
451
- "learning_rate": 3.216374269005848e-05,
452
- "loss": 0.1693,
453
- "step": 480
454
- },
455
- {
456
- "epoch": 12.81045751633987,
457
- "grad_norm": 1.5047080516815186,
458
- "learning_rate": 3.167641325536063e-05,
459
- "loss": 0.2374,
460
- "step": 490
461
- },
462
- {
463
- "epoch": 12.993464052287582,
464
- "eval_accuracy": 0.875,
465
- "eval_loss": 0.42333418130874634,
466
- "eval_runtime": 32.071,
467
- "eval_samples_per_second": 4.241,
468
- "eval_steps_per_second": 0.53,
469
- "step": 497
470
  },
471
  {
472
- "epoch": 13.071895424836601,
473
- "grad_norm": 1.6010866165161133,
474
- "learning_rate": 3.118908382066277e-05,
475
- "loss": 0.2091,
476
- "step": 500
 
 
477
  },
478
  {
479
  "epoch": 13.333333333333334,
480
- "grad_norm": 1.0056742429733276,
481
- "learning_rate": 3.0701754385964913e-05,
482
- "loss": 0.2203,
483
- "step": 510
484
- },
485
- {
486
- "epoch": 13.594771241830065,
487
- "grad_norm": 0.8992928266525269,
488
- "learning_rate": 3.0214424951267055e-05,
489
- "loss": 0.1784,
490
- "step": 520
491
- },
492
- {
493
- "epoch": 13.856209150326798,
494
- "grad_norm": 1.4664764404296875,
495
- "learning_rate": 2.9727095516569204e-05,
496
- "loss": 0.128,
497
- "step": 530
498
- },
499
- {
500
- "epoch": 13.986928104575163,
501
- "eval_accuracy": 0.8970588235294118,
502
- "eval_loss": 0.367097944021225,
503
- "eval_runtime": 31.5031,
504
- "eval_samples_per_second": 4.317,
505
- "eval_steps_per_second": 0.54,
506
- "step": 535
507
  },
508
  {
509
- "epoch": 14.117647058823529,
510
- "grad_norm": 2.8378169536590576,
511
- "learning_rate": 2.9239766081871346e-05,
512
- "loss": 0.1436,
513
- "step": 540
 
 
514
  },
515
  {
516
- "epoch": 14.379084967320262,
517
- "grad_norm": 1.6675957441329956,
518
- "learning_rate": 2.875243664717349e-05,
519
- "loss": 0.1986,
520
- "step": 550
521
  },
522
  {
523
- "epoch": 14.640522875816993,
524
- "grad_norm": 8.45456314086914,
525
- "learning_rate": 2.8265107212475634e-05,
526
- "loss": 0.1711,
527
- "step": 560
 
 
528
  },
529
  {
530
- "epoch": 14.901960784313726,
531
- "grad_norm": 6.542934417724609,
532
- "learning_rate": 2.777777777777778e-05,
533
- "loss": 0.1718,
534
- "step": 570
535
  },
536
  {
537
- "epoch": 14.980392156862745,
538
  "eval_accuracy": 0.8970588235294118,
539
- "eval_loss": 0.34296926856040955,
540
- "eval_runtime": 31.8312,
541
- "eval_samples_per_second": 4.273,
542
- "eval_steps_per_second": 0.534,
543
- "step": 573
544
- },
545
- {
546
- "epoch": 15.163398692810457,
547
- "grad_norm": 4.025688171386719,
548
- "learning_rate": 2.729044834307992e-05,
549
- "loss": 0.1681,
550
- "step": 580
551
- },
552
- {
553
- "epoch": 15.42483660130719,
554
- "grad_norm": 0.5761296153068542,
555
- "learning_rate": 2.680311890838207e-05,
556
- "loss": 0.1558,
557
- "step": 590
558
  },
559
  {
560
- "epoch": 15.686274509803921,
561
- "grad_norm": 3.5217578411102295,
562
- "learning_rate": 2.6315789473684212e-05,
563
- "loss": 0.1413,
564
- "step": 600
565
  },
566
  {
567
- "epoch": 15.947712418300654,
568
- "grad_norm": 5.222166538238525,
569
- "learning_rate": 2.5828460038986357e-05,
570
- "loss": 0.16,
571
- "step": 610
 
 
572
  },
573
  {
574
- "epoch": 16.0,
575
- "eval_accuracy": 0.875,
576
- "eval_loss": 0.41041284799575806,
577
- "eval_runtime": 32.0124,
578
- "eval_samples_per_second": 4.248,
579
- "eval_steps_per_second": 0.531,
580
- "step": 612
581
  },
582
  {
583
- "epoch": 16.209150326797385,
584
- "grad_norm": 4.111217021942139,
585
- "learning_rate": 2.53411306042885e-05,
586
- "loss": 0.136,
587
- "step": 620
 
 
588
  },
589
  {
590
- "epoch": 16.470588235294116,
591
- "grad_norm": 4.035433769226074,
592
- "learning_rate": 2.485380116959064e-05,
593
- "loss": 0.1369,
594
- "step": 630
595
  },
596
  {
597
- "epoch": 16.73202614379085,
598
- "grad_norm": 1.081305980682373,
599
- "learning_rate": 2.4366471734892787e-05,
600
- "loss": 0.1331,
601
- "step": 640
 
 
602
  },
603
  {
604
- "epoch": 16.99346405228758,
605
- "grad_norm": 0.21553057432174683,
606
- "learning_rate": 2.3879142300194932e-05,
607
- "loss": 0.1096,
608
- "step": 650
609
  },
610
  {
611
- "epoch": 16.99346405228758,
612
  "eval_accuracy": 0.9117647058823529,
613
- "eval_loss": 0.2919999659061432,
614
- "eval_runtime": 32.5572,
615
- "eval_samples_per_second": 4.177,
616
- "eval_steps_per_second": 0.522,
617
- "step": 650
618
  },
619
  {
620
- "epoch": 17.254901960784313,
621
- "grad_norm": 5.312492847442627,
622
- "learning_rate": 2.3391812865497074e-05,
623
- "loss": 0.0943,
624
- "step": 660
625
- },
626
- {
627
- "epoch": 17.516339869281047,
628
- "grad_norm": 0.38019096851348877,
629
- "learning_rate": 2.290448343079922e-05,
630
- "loss": 0.1058,
631
- "step": 670
632
- },
633
- {
634
- "epoch": 17.77777777777778,
635
- "grad_norm": 9.7896089553833,
636
- "learning_rate": 2.2417153996101365e-05,
637
- "loss": 0.1408,
638
- "step": 680
639
  },
640
  {
641
- "epoch": 17.986928104575163,
642
  "eval_accuracy": 0.9044117647058824,
643
- "eval_loss": 0.26298093795776367,
644
- "eval_runtime": 31.7137,
645
- "eval_samples_per_second": 4.288,
646
- "eval_steps_per_second": 0.536,
647
- "step": 688
648
- },
649
- {
650
- "epoch": 18.03921568627451,
651
- "grad_norm": 1.3320598602294922,
652
- "learning_rate": 2.1929824561403507e-05,
653
- "loss": 0.1352,
654
- "step": 690
655
  },
656
  {
657
- "epoch": 18.30065359477124,
658
- "grad_norm": 1.0650367736816406,
659
- "learning_rate": 2.1442495126705653e-05,
660
- "loss": 0.1149,
661
- "step": 700
662
  },
663
  {
664
- "epoch": 18.562091503267975,
665
- "grad_norm": 0.5184645056724548,
666
- "learning_rate": 2.0955165692007798e-05,
667
- "loss": 0.1414,
668
- "step": 710
 
 
669
  },
670
  {
671
- "epoch": 18.823529411764707,
672
- "grad_norm": 0.20009098947048187,
673
- "learning_rate": 2.046783625730994e-05,
674
- "loss": 0.113,
675
- "step": 720
676
  },
677
  {
678
- "epoch": 18.980392156862745,
679
- "eval_accuracy": 0.8823529411764706,
680
- "eval_loss": 0.30839478969573975,
681
- "eval_runtime": 31.8589,
682
- "eval_samples_per_second": 4.269,
683
- "eval_steps_per_second": 0.534,
684
- "step": 726
685
  },
686
  {
687
- "epoch": 19.084967320261438,
688
- "grad_norm": 0.23899152874946594,
689
- "learning_rate": 1.9980506822612085e-05,
690
- "loss": 0.082,
691
- "step": 730
692
  },
693
  {
694
- "epoch": 19.34640522875817,
695
- "grad_norm": 2.6471519470214844,
696
- "learning_rate": 1.949317738791423e-05,
697
- "loss": 0.0818,
698
- "step": 740
 
 
699
  },
700
  {
701
- "epoch": 19.607843137254903,
702
- "grad_norm": 3.0529627799987793,
703
- "learning_rate": 1.9005847953216373e-05,
704
- "loss": 0.1271,
705
- "step": 750
706
  },
707
  {
708
- "epoch": 19.869281045751634,
709
- "grad_norm": 0.2476678043603897,
710
- "learning_rate": 1.8518518518518518e-05,
711
- "loss": 0.1272,
712
- "step": 760
 
 
713
  },
714
  {
715
- "epoch": 20.0,
716
- "eval_accuracy": 0.9411764705882353,
717
- "eval_loss": 0.2523283064365387,
718
- "eval_runtime": 31.804,
719
- "eval_samples_per_second": 4.276,
720
- "eval_steps_per_second": 0.535,
721
- "step": 765
722
  },
723
  {
724
- "epoch": 20.130718954248366,
725
- "grad_norm": 11.856123924255371,
726
- "learning_rate": 1.8031189083820664e-05,
727
- "loss": 0.1052,
728
- "step": 770
 
 
729
  },
730
  {
731
- "epoch": 20.392156862745097,
732
- "grad_norm": 3.1671411991119385,
733
- "learning_rate": 1.7543859649122806e-05,
734
- "loss": 0.1264,
735
- "step": 780
736
  },
737
  {
738
- "epoch": 20.65359477124183,
739
- "grad_norm": 0.7181591391563416,
740
- "learning_rate": 1.705653021442495e-05,
741
- "loss": 0.1141,
742
- "step": 790
 
 
743
  },
744
  {
745
- "epoch": 20.915032679738562,
746
- "grad_norm": 3.510709762573242,
747
- "learning_rate": 1.6569200779727097e-05,
748
- "loss": 0.119,
749
- "step": 800
750
  },
751
  {
752
- "epoch": 20.99346405228758,
753
  "eval_accuracy": 0.8823529411764706,
754
- "eval_loss": 0.42536023259162903,
755
- "eval_runtime": 32.0193,
756
- "eval_samples_per_second": 4.247,
757
- "eval_steps_per_second": 0.531,
758
- "step": 803
759
- },
760
- {
761
- "epoch": 21.176470588235293,
762
- "grad_norm": 2.389382839202881,
763
- "learning_rate": 1.608187134502924e-05,
764
- "loss": 0.1316,
765
- "step": 810
766
  },
767
  {
768
- "epoch": 21.437908496732025,
769
- "grad_norm": 6.068481922149658,
770
- "learning_rate": 1.5594541910331384e-05,
771
- "loss": 0.1126,
772
- "step": 820
773
  },
774
  {
775
- "epoch": 21.69934640522876,
776
- "grad_norm": 0.7834749817848206,
777
- "learning_rate": 1.5107212475633528e-05,
778
- "loss": 0.0782,
779
- "step": 830
 
 
780
  },
781
  {
782
- "epoch": 21.96078431372549,
783
- "grad_norm": 2.6613547801971436,
784
- "learning_rate": 1.4619883040935673e-05,
785
- "loss": 0.1068,
786
- "step": 840
787
  },
788
  {
789
- "epoch": 21.986928104575163,
790
- "eval_accuracy": 0.8970588235294118,
791
- "eval_loss": 0.35191574692726135,
792
- "eval_runtime": 31.8243,
793
- "eval_samples_per_second": 4.273,
794
- "eval_steps_per_second": 0.534,
795
- "step": 841
796
  },
797
  {
798
- "epoch": 22.22222222222222,
799
- "grad_norm": 2.7536206245422363,
800
- "learning_rate": 1.4132553606237817e-05,
801
- "loss": 0.1058,
802
- "step": 850
803
  },
804
  {
805
- "epoch": 22.483660130718953,
806
- "grad_norm": 5.085578441619873,
807
- "learning_rate": 1.364522417153996e-05,
808
- "loss": 0.0847,
809
- "step": 860
 
 
810
  },
811
  {
812
- "epoch": 22.745098039215687,
813
- "grad_norm": 1.596222996711731,
814
- "learning_rate": 1.3157894736842106e-05,
815
- "loss": 0.0723,
816
- "step": 870
817
  },
818
  {
819
- "epoch": 22.980392156862745,
820
  "eval_accuracy": 0.9191176470588235,
821
- "eval_loss": 0.32932135462760925,
822
- "eval_runtime": 33.5379,
823
- "eval_samples_per_second": 4.055,
824
- "eval_steps_per_second": 0.507,
825
- "step": 879
826
- },
827
- {
828
- "epoch": 23.00653594771242,
829
- "grad_norm": 1.2901531457901,
830
- "learning_rate": 1.267056530214425e-05,
831
- "loss": 0.0937,
832
- "step": 880
833
  },
834
  {
835
- "epoch": 23.26797385620915,
836
- "grad_norm": 0.6107073426246643,
837
- "learning_rate": 1.2183235867446393e-05,
838
- "loss": 0.076,
839
- "step": 890
840
- },
841
- {
842
- "epoch": 23.529411764705884,
843
- "grad_norm": 0.1317284107208252,
844
- "learning_rate": 1.1695906432748537e-05,
845
- "loss": 0.0742,
846
- "step": 900
847
- },
848
- {
849
- "epoch": 23.790849673202615,
850
- "grad_norm": 1.982415795326233,
851
- "learning_rate": 1.1208576998050683e-05,
852
- "loss": 0.0769,
853
- "step": 910
854
  },
855
  {
856
- "epoch": 24.0,
857
  "eval_accuracy": 0.9264705882352942,
858
- "eval_loss": 0.2613476812839508,
859
- "eval_runtime": 32.3164,
860
- "eval_samples_per_second": 4.208,
861
- "eval_steps_per_second": 0.526,
862
- "step": 918
863
  },
864
  {
865
- "epoch": 24.052287581699346,
866
- "grad_norm": 0.13533662259578705,
867
- "learning_rate": 1.0721247563352826e-05,
868
- "loss": 0.0872,
869
- "step": 920
870
  },
871
  {
872
- "epoch": 24.313725490196077,
873
- "grad_norm": 0.18768663704395294,
874
- "learning_rate": 1.023391812865497e-05,
875
- "loss": 0.0836,
876
- "step": 930
 
 
877
  },
878
  {
879
- "epoch": 24.575163398692812,
880
- "grad_norm": 1.9041831493377686,
881
- "learning_rate": 9.746588693957115e-06,
882
- "loss": 0.0968,
883
- "step": 940
884
  },
885
  {
886
- "epoch": 24.836601307189543,
887
- "grad_norm": 1.782492756843567,
888
- "learning_rate": 9.259259259259259e-06,
889
- "loss": 0.095,
890
- "step": 950
 
 
891
  },
892
  {
893
- "epoch": 24.99346405228758,
894
- "eval_accuracy": 0.9411764705882353,
895
- "eval_loss": 0.26091232895851135,
896
- "eval_runtime": 32.1423,
897
- "eval_samples_per_second": 4.231,
898
- "eval_steps_per_second": 0.529,
899
- "step": 956
900
  },
901
  {
902
- "epoch": 25.098039215686274,
903
- "grad_norm": 0.38502180576324463,
904
- "learning_rate": 8.771929824561403e-06,
905
- "loss": 0.0871,
906
- "step": 960
 
 
907
  },
908
  {
909
- "epoch": 25.359477124183005,
910
- "grad_norm": 7.287825584411621,
911
- "learning_rate": 8.284600389863548e-06,
912
- "loss": 0.0662,
913
- "step": 970
914
  },
915
  {
916
- "epoch": 25.62091503267974,
917
- "grad_norm": 3.129642963409424,
918
- "learning_rate": 7.797270955165692e-06,
919
- "loss": 0.0506,
920
- "step": 980
 
 
921
  },
922
  {
923
- "epoch": 25.88235294117647,
924
- "grad_norm": 0.24554868042469025,
925
- "learning_rate": 7.3099415204678366e-06,
926
- "loss": 0.0863,
927
- "step": 990
928
  },
929
  {
930
- "epoch": 25.986928104575163,
931
  "eval_accuracy": 0.9264705882352942,
932
- "eval_loss": 0.264960914850235,
933
- "eval_runtime": 32.1045,
934
- "eval_samples_per_second": 4.236,
935
- "eval_steps_per_second": 0.53,
936
- "step": 994
937
- },
938
- {
939
- "epoch": 26.143790849673202,
940
- "grad_norm": 1.0143533945083618,
941
- "learning_rate": 6.82261208576998e-06,
942
- "loss": 0.0604,
943
- "step": 1000
944
  },
945
  {
946
- "epoch": 26.405228758169933,
947
- "grad_norm": 0.31393057107925415,
948
- "learning_rate": 6.335282651072125e-06,
949
- "loss": 0.0589,
950
- "step": 1010
951
  },
952
  {
953
- "epoch": 26.666666666666668,
954
- "grad_norm": 5.615325450897217,
955
- "learning_rate": 5.8479532163742686e-06,
956
- "loss": 0.0654,
957
- "step": 1020
 
 
958
  },
959
  {
960
- "epoch": 26.9281045751634,
961
- "grad_norm": 0.4632560610771179,
962
- "learning_rate": 5.360623781676413e-06,
963
- "loss": 0.0795,
964
- "step": 1030
965
  },
966
  {
967
- "epoch": 26.980392156862745,
968
- "eval_accuracy": 0.9117647058823529,
969
- "eval_loss": 0.297785222530365,
970
- "eval_runtime": 31.5613,
971
- "eval_samples_per_second": 4.309,
972
- "eval_steps_per_second": 0.539,
973
- "step": 1032
974
  },
975
  {
976
- "epoch": 27.18954248366013,
977
- "grad_norm": 0.2585015594959259,
978
- "learning_rate": 4.873294346978558e-06,
979
- "loss": 0.0719,
980
- "step": 1040
 
 
981
  },
982
  {
983
- "epoch": 27.45098039215686,
984
- "grad_norm": 0.364827960729599,
985
- "learning_rate": 4.3859649122807014e-06,
986
- "loss": 0.0837,
987
- "step": 1050
988
  },
989
  {
990
- "epoch": 27.712418300653596,
991
- "grad_norm": 0.14460325241088867,
992
- "learning_rate": 3.898635477582846e-06,
993
- "loss": 0.0483,
994
- "step": 1060
 
 
995
  },
996
  {
997
- "epoch": 27.973856209150327,
998
- "grad_norm": 0.43661826848983765,
999
- "learning_rate": 3.41130604288499e-06,
1000
- "loss": 0.0564,
1001
- "step": 1070
1002
  },
1003
  {
1004
- "epoch": 28.0,
1005
- "eval_accuracy": 0.9191176470588235,
1006
- "eval_loss": 0.2736634314060211,
1007
- "eval_runtime": 31.9037,
1008
- "eval_samples_per_second": 4.263,
1009
- "eval_steps_per_second": 0.533,
1010
- "step": 1071
1011
  },
1012
  {
1013
- "epoch": 28.235294117647058,
1014
- "grad_norm": 0.2871550917625427,
1015
- "learning_rate": 2.9239766081871343e-06,
1016
- "loss": 0.0676,
1017
- "step": 1080
1018
  },
1019
  {
1020
- "epoch": 28.49673202614379,
1021
- "grad_norm": 0.19494622945785522,
1022
- "learning_rate": 2.436647173489279e-06,
1023
- "loss": 0.055,
1024
- "step": 1090
 
 
1025
  },
1026
  {
1027
- "epoch": 28.758169934640524,
1028
- "grad_norm": 0.46231183409690857,
1029
- "learning_rate": 1.949317738791423e-06,
1030
- "loss": 0.0562,
1031
- "step": 1100
1032
  },
1033
  {
1034
- "epoch": 28.99346405228758,
1035
- "eval_accuracy": 0.9191176470588235,
1036
- "eval_loss": 0.2940830886363983,
1037
- "eval_runtime": 32.3077,
1038
- "eval_samples_per_second": 4.21,
1039
- "eval_steps_per_second": 0.526,
1040
- "step": 1109
1041
  },
1042
  {
1043
- "epoch": 29.019607843137255,
1044
- "grad_norm": 0.13441592454910278,
1045
- "learning_rate": 1.4619883040935671e-06,
1046
- "loss": 0.049,
1047
- "step": 1110
1048
  },
1049
  {
1050
- "epoch": 29.281045751633986,
1051
- "grad_norm": 5.47345495223999,
1052
- "learning_rate": 9.746588693957115e-07,
1053
- "loss": 0.0712,
1054
- "step": 1120
 
 
1055
  },
1056
  {
1057
- "epoch": 29.54248366013072,
1058
- "grad_norm": 0.6587017178535461,
1059
- "learning_rate": 4.873294346978557e-07,
1060
- "loss": 0.0679,
1061
- "step": 1130
1062
  },
1063
  {
1064
- "epoch": 29.80392156862745,
1065
- "grad_norm": 0.12627951800823212,
1066
- "learning_rate": 0.0,
1067
- "loss": 0.0751,
1068
- "step": 1140
 
 
1069
  },
1070
  {
1071
- "epoch": 29.80392156862745,
1072
- "eval_accuracy": 0.9191176470588235,
1073
- "eval_loss": 0.3110995590686798,
1074
- "eval_runtime": 33.2625,
1075
- "eval_samples_per_second": 4.089,
1076
- "eval_steps_per_second": 0.511,
1077
- "step": 1140
1078
- },
1079
- {
1080
- "epoch": 29.80392156862745,
1081
- "step": 1140,
1082
- "total_flos": 2.827009127861453e+18,
1083
- "train_loss": 0.3311853929046999,
1084
- "train_runtime": 10131.822,
1085
- "train_samples_per_second": 3.624,
1086
- "train_steps_per_second": 0.113
1087
  }
1088
  ],
1089
  "logging_steps": 10,
1090
- "max_steps": 1140,
1091
  "num_input_tokens_seen": 0,
1092
- "num_train_epochs": 30,
1093
  "save_steps": 500,
1094
  "stateful_callbacks": {
1095
  "TrainerControl": {
@@ -1103,8 +773,8 @@
1103
  "attributes": {}
1104
  }
1105
  },
1106
- "total_flos": 2.827009127861453e+18,
1107
- "train_batch_size": 8,
1108
  "trial_name": null,
1109
  "trial_params": null
1110
  }
 
1
  {
2
+ "best_metric": 0.9338235294117647,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-papsmear/checkpoint-341",
4
+ "epoch": 46.15384615384615,
5
  "eval_steps": 500,
6
+ "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9230769230769231,
13
+ "eval_accuracy": 0.20588235294117646,
14
+ "eval_loss": 1.7879393100738525,
15
+ "eval_runtime": 31.0661,
16
+ "eval_samples_per_second": 4.378,
17
+ "eval_steps_per_second": 0.161,
18
+ "step": 9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  },
20
  {
21
+ "epoch": 1.0256410256410255,
22
+ "grad_norm": 1.2670034170150757,
23
+ "learning_rate": 1.1111111111111112e-05,
24
+ "loss": 1.8037,
25
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  },
27
  {
28
+ "epoch": 1.9487179487179487,
29
+ "eval_accuracy": 0.40441176470588236,
30
+ "eval_loss": 1.648475170135498,
31
+ "eval_runtime": 32.5892,
32
+ "eval_samples_per_second": 4.173,
33
+ "eval_steps_per_second": 0.153,
34
+ "step": 19
35
  },
36
  {
37
+ "epoch": 2.051282051282051,
38
+ "grad_norm": 1.1165878772735596,
39
+ "learning_rate": 2.2222222222222223e-05,
40
+ "loss": 1.6961,
41
+ "step": 20
42
  },
43
  {
44
+ "epoch": 2.9743589743589745,
45
+ "eval_accuracy": 0.39705882352941174,
46
+ "eval_loss": 1.4881964921951294,
47
+ "eval_runtime": 31.9851,
48
+ "eval_samples_per_second": 4.252,
49
+ "eval_steps_per_second": 0.156,
50
+ "step": 29
51
  },
52
  {
53
+ "epoch": 3.076923076923077,
54
+ "grad_norm": 1.1624480485916138,
55
+ "learning_rate": 3.3333333333333335e-05,
56
+ "loss": 1.5407,
57
+ "step": 30
 
 
 
 
 
 
 
58
  },
59
  {
60
  "epoch": 4.0,
61
+ "eval_accuracy": 0.5220588235294118,
62
+ "eval_loss": 1.3068838119506836,
63
+ "eval_runtime": 31.423,
64
+ "eval_samples_per_second": 4.328,
65
+ "eval_steps_per_second": 0.159,
66
+ "step": 39
67
+ },
68
+ {
69
+ "epoch": 4.102564102564102,
70
+ "grad_norm": 0.9998506307601929,
71
+ "learning_rate": 4.4444444444444447e-05,
72
+ "loss": 1.3308,
73
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  },
75
  {
76
+ "epoch": 4.923076923076923,
77
+ "eval_accuracy": 0.6029411764705882,
78
+ "eval_loss": 1.1338790655136108,
79
+ "eval_runtime": 30.9165,
80
+ "eval_samples_per_second": 4.399,
81
+ "eval_steps_per_second": 0.162,
82
+ "step": 48
83
  },
84
  {
85
+ "epoch": 5.128205128205128,
86
+ "grad_norm": 1.2114465236663818,
87
+ "learning_rate": 4.938271604938271e-05,
88
+ "loss": 1.1074,
89
+ "step": 50
 
 
90
  },
91
  {
92
+ "epoch": 5.948717948717949,
93
+ "eval_accuracy": 0.75,
94
+ "eval_loss": 0.9395625591278076,
95
+ "eval_runtime": 30.6345,
96
+ "eval_samples_per_second": 4.439,
97
+ "eval_steps_per_second": 0.163,
98
+ "step": 58
99
  },
100
  {
101
+ "epoch": 6.153846153846154,
102
+ "grad_norm": 1.676944375038147,
103
+ "learning_rate": 4.814814814814815e-05,
104
+ "loss": 0.9162,
105
+ "step": 60
106
  },
107
  {
108
+ "epoch": 6.9743589743589745,
109
+ "eval_accuracy": 0.7647058823529411,
110
+ "eval_loss": 0.8551253080368042,
111
+ "eval_runtime": 30.343,
112
+ "eval_samples_per_second": 4.482,
113
+ "eval_steps_per_second": 0.165,
114
+ "step": 68
115
  },
116
  {
117
+ "epoch": 7.17948717948718,
118
+ "grad_norm": 1.8491935729980469,
119
+ "learning_rate": 4.691358024691358e-05,
120
+ "loss": 0.8174,
121
+ "step": 70
122
  },
123
  {
124
  "epoch": 8.0,
125
+ "eval_accuracy": 0.7573529411764706,
126
+ "eval_loss": 0.8290878534317017,
127
+ "eval_runtime": 30.401,
128
+ "eval_samples_per_second": 4.474,
129
+ "eval_steps_per_second": 0.164,
130
+ "step": 78
131
+ },
132
+ {
133
+ "epoch": 8.205128205128204,
134
+ "grad_norm": 1.4983110427856445,
135
+ "learning_rate": 4.567901234567901e-05,
136
+ "loss": 0.7135,
137
+ "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  },
139
  {
140
+ "epoch": 8.923076923076923,
141
+ "eval_accuracy": 0.7941176470588235,
142
+ "eval_loss": 0.7505079507827759,
143
+ "eval_runtime": 31.6046,
144
+ "eval_samples_per_second": 4.303,
145
+ "eval_steps_per_second": 0.158,
146
+ "step": 87
147
  },
148
  {
149
+ "epoch": 9.23076923076923,
150
+ "grad_norm": 3.118570566177368,
151
+ "learning_rate": 4.4444444444444447e-05,
152
+ "loss": 0.6222,
153
+ "step": 90
 
 
 
 
 
 
 
154
  },
155
  {
156
+ "epoch": 9.948717948717949,
157
+ "eval_accuracy": 0.8455882352941176,
158
+ "eval_loss": 0.6433916687965393,
159
+ "eval_runtime": 30.9566,
160
+ "eval_samples_per_second": 4.393,
161
+ "eval_steps_per_second": 0.162,
162
+ "step": 97
163
  },
164
  {
165
+ "epoch": 10.256410256410255,
166
+ "grad_norm": 1.8012874126434326,
167
+ "learning_rate": 4.3209876543209875e-05,
168
+ "loss": 0.5445,
169
+ "step": 100
170
  },
171
  {
172
+ "epoch": 10.974358974358974,
173
+ "eval_accuracy": 0.8529411764705882,
174
+ "eval_loss": 0.5995692610740662,
175
+ "eval_runtime": 31.2634,
176
+ "eval_samples_per_second": 4.35,
177
+ "eval_steps_per_second": 0.16,
178
+ "step": 107
179
  },
180
  {
181
+ "epoch": 11.282051282051283,
182
+ "grad_norm": 2.380070447921753,
183
+ "learning_rate": 4.197530864197531e-05,
184
+ "loss": 0.4935,
185
+ "step": 110
186
  },
187
  {
188
  "epoch": 12.0,
189
+ "eval_accuracy": 0.8529411764705882,
190
+ "eval_loss": 0.5513917207717896,
191
+ "eval_runtime": 30.5663,
192
+ "eval_samples_per_second": 4.449,
193
+ "eval_steps_per_second": 0.164,
194
+ "step": 117
195
+ },
196
+ {
197
+ "epoch": 12.307692307692308,
198
+ "grad_norm": 2.3905367851257324,
199
+ "learning_rate": 4.074074074074074e-05,
200
+ "loss": 0.4131,
201
+ "step": 120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  },
203
  {
204
+ "epoch": 12.923076923076923,
205
+ "eval_accuracy": 0.8602941176470589,
206
+ "eval_loss": 0.5028533339500427,
207
+ "eval_runtime": 29.2998,
208
+ "eval_samples_per_second": 4.642,
209
+ "eval_steps_per_second": 0.171,
210
+ "step": 126
211
  },
212
  {
213
  "epoch": 13.333333333333334,
214
+ "grad_norm": 2.1402792930603027,
215
+ "learning_rate": 3.950617283950617e-05,
216
+ "loss": 0.4012,
217
+ "step": 130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  },
219
  {
220
+ "epoch": 13.948717948717949,
221
+ "eval_accuracy": 0.8382352941176471,
222
+ "eval_loss": 0.5566176176071167,
223
+ "eval_runtime": 31.1116,
224
+ "eval_samples_per_second": 4.371,
225
+ "eval_steps_per_second": 0.161,
226
+ "step": 136
227
  },
228
  {
229
+ "epoch": 14.35897435897436,
230
+ "grad_norm": 2.5639398097991943,
231
+ "learning_rate": 3.82716049382716e-05,
232
+ "loss": 0.3689,
233
+ "step": 140
234
  },
235
  {
236
+ "epoch": 14.974358974358974,
237
+ "eval_accuracy": 0.8382352941176471,
238
+ "eval_loss": 0.5533155798912048,
239
+ "eval_runtime": 29.2502,
240
+ "eval_samples_per_second": 4.65,
241
+ "eval_steps_per_second": 0.171,
242
+ "step": 146
243
  },
244
  {
245
+ "epoch": 15.384615384615385,
246
+ "grad_norm": 3.994143009185791,
247
+ "learning_rate": 3.7037037037037037e-05,
248
+ "loss": 0.3533,
249
+ "step": 150
250
  },
251
  {
252
+ "epoch": 16.0,
253
  "eval_accuracy": 0.8970588235294118,
254
+ "eval_loss": 0.423177033662796,
255
+ "eval_runtime": 29.8411,
256
+ "eval_samples_per_second": 4.557,
257
+ "eval_steps_per_second": 0.168,
258
+ "step": 156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  },
260
  {
261
+ "epoch": 16.41025641025641,
262
+ "grad_norm": 3.873570203781128,
263
+ "learning_rate": 3.580246913580247e-05,
264
+ "loss": 0.2954,
265
+ "step": 160
266
  },
267
  {
268
+ "epoch": 16.923076923076923,
269
+ "eval_accuracy": 0.8897058823529411,
270
+ "eval_loss": 0.4589211940765381,
271
+ "eval_runtime": 29.2531,
272
+ "eval_samples_per_second": 4.649,
273
+ "eval_steps_per_second": 0.171,
274
+ "step": 165
275
  },
276
  {
277
+ "epoch": 17.435897435897434,
278
+ "grad_norm": 2.3892805576324463,
279
+ "learning_rate": 3.45679012345679e-05,
280
+ "loss": 0.2907,
281
+ "step": 170
 
 
282
  },
283
  {
284
+ "epoch": 17.94871794871795,
285
+ "eval_accuracy": 0.8970588235294118,
286
+ "eval_loss": 0.4223209321498871,
287
+ "eval_runtime": 28.9917,
288
+ "eval_samples_per_second": 4.691,
289
+ "eval_steps_per_second": 0.172,
290
+ "step": 175
291
  },
292
  {
293
+ "epoch": 18.46153846153846,
294
+ "grad_norm": 0.7764536142349243,
295
+ "learning_rate": 3.3333333333333335e-05,
296
+ "loss": 0.2804,
297
+ "step": 180
298
  },
299
  {
300
+ "epoch": 18.974358974358974,
301
+ "eval_accuracy": 0.8970588235294118,
302
+ "eval_loss": 0.4056432843208313,
303
+ "eval_runtime": 28.7309,
304
+ "eval_samples_per_second": 4.734,
305
+ "eval_steps_per_second": 0.174,
306
+ "step": 185
307
  },
308
  {
309
+ "epoch": 19.487179487179485,
310
+ "grad_norm": 1.780919075012207,
311
+ "learning_rate": 3.209876543209876e-05,
312
+ "loss": 0.2469,
313
+ "step": 190
314
  },
315
  {
316
+ "epoch": 20.0,
317
  "eval_accuracy": 0.9117647058823529,
318
+ "eval_loss": 0.390433132648468,
319
+ "eval_runtime": 29.1361,
320
+ "eval_samples_per_second": 4.668,
321
+ "eval_steps_per_second": 0.172,
322
+ "step": 195
323
  },
324
  {
325
+ "epoch": 20.51282051282051,
326
+ "grad_norm": 3.051804304122925,
327
+ "learning_rate": 3.08641975308642e-05,
328
+ "loss": 0.2643,
329
+ "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  },
331
  {
332
+ "epoch": 20.923076923076923,
333
  "eval_accuracy": 0.9044117647058824,
334
+ "eval_loss": 0.38655146956443787,
335
+ "eval_runtime": 28.9046,
336
+ "eval_samples_per_second": 4.705,
337
+ "eval_steps_per_second": 0.173,
338
+ "step": 204
 
 
 
 
 
 
 
339
  },
340
  {
341
+ "epoch": 21.53846153846154,
342
+ "grad_norm": 2.3158023357391357,
343
+ "learning_rate": 2.962962962962963e-05,
344
+ "loss": 0.2212,
345
+ "step": 210
346
  },
347
  {
348
+ "epoch": 21.94871794871795,
349
+ "eval_accuracy": 0.875,
350
+ "eval_loss": 0.4173263609409332,
351
+ "eval_runtime": 29.2277,
352
+ "eval_samples_per_second": 4.653,
353
+ "eval_steps_per_second": 0.171,
354
+ "step": 214
355
  },
356
  {
357
+ "epoch": 22.564102564102566,
358
+ "grad_norm": 3.5362863540649414,
359
+ "learning_rate": 2.839506172839506e-05,
360
+ "loss": 0.2476,
361
+ "step": 220
362
  },
363
  {
364
+ "epoch": 22.974358974358974,
365
+ "eval_accuracy": 0.8014705882352942,
366
+ "eval_loss": 0.6001182794570923,
367
+ "eval_runtime": 29.5734,
368
+ "eval_samples_per_second": 4.599,
369
+ "eval_steps_per_second": 0.169,
370
+ "step": 224
371
  },
372
  {
373
+ "epoch": 23.58974358974359,
374
+ "grad_norm": 2.7392866611480713,
375
+ "learning_rate": 2.7160493827160493e-05,
376
+ "loss": 0.2347,
377
+ "step": 230
378
  },
379
  {
380
+ "epoch": 24.0,
381
+ "eval_accuracy": 0.9044117647058824,
382
+ "eval_loss": 0.39004915952682495,
383
+ "eval_runtime": 28.5909,
384
+ "eval_samples_per_second": 4.757,
385
+ "eval_steps_per_second": 0.175,
386
+ "step": 234
387
  },
388
  {
389
+ "epoch": 24.615384615384617,
390
+ "grad_norm": 1.7161147594451904,
391
+ "learning_rate": 2.5925925925925925e-05,
392
+ "loss": 0.207,
393
+ "step": 240
394
  },
395
  {
396
+ "epoch": 24.923076923076923,
397
+ "eval_accuracy": 0.8897058823529411,
398
+ "eval_loss": 0.40333059430122375,
399
+ "eval_runtime": 29.3709,
400
+ "eval_samples_per_second": 4.63,
401
+ "eval_steps_per_second": 0.17,
402
+ "step": 243
403
  },
404
  {
405
+ "epoch": 25.641025641025642,
406
+ "grad_norm": 3.595083713531494,
407
+ "learning_rate": 2.4691358024691357e-05,
408
+ "loss": 0.1803,
409
+ "step": 250
 
 
410
  },
411
  {
412
+ "epoch": 25.94871794871795,
413
+ "eval_accuracy": 0.9264705882352942,
414
+ "eval_loss": 0.3509926497936249,
415
+ "eval_runtime": 28.9919,
416
+ "eval_samples_per_second": 4.691,
417
+ "eval_steps_per_second": 0.172,
418
+ "step": 253
419
  },
420
  {
421
+ "epoch": 26.666666666666668,
422
+ "grad_norm": 3.6539347171783447,
423
+ "learning_rate": 2.345679012345679e-05,
424
+ "loss": 0.1979,
425
+ "step": 260
426
  },
427
  {
428
+ "epoch": 26.974358974358974,
429
+ "eval_accuracy": 0.9191176470588235,
430
+ "eval_loss": 0.3723020553588867,
431
+ "eval_runtime": 28.5326,
432
+ "eval_samples_per_second": 4.766,
433
+ "eval_steps_per_second": 0.175,
434
+ "step": 263
435
  },
436
  {
437
+ "epoch": 27.692307692307693,
438
+ "grad_norm": 2.836503505706787,
439
+ "learning_rate": 2.2222222222222223e-05,
440
+ "loss": 0.1821,
441
+ "step": 270
442
  },
443
  {
444
+ "epoch": 28.0,
445
  "eval_accuracy": 0.8823529411764706,
446
+ "eval_loss": 0.4319731295108795,
447
+ "eval_runtime": 29.3401,
448
+ "eval_samples_per_second": 4.635,
449
+ "eval_steps_per_second": 0.17,
450
+ "step": 273
 
 
 
 
 
 
 
451
  },
452
  {
453
+ "epoch": 28.71794871794872,
454
+ "grad_norm": 3.2694666385650635,
455
+ "learning_rate": 2.0987654320987655e-05,
456
+ "loss": 0.1992,
457
+ "step": 280
458
  },
459
  {
460
+ "epoch": 28.923076923076923,
461
+ "eval_accuracy": 0.9117647058823529,
462
+ "eval_loss": 0.3557020127773285,
463
+ "eval_runtime": 29.5759,
464
+ "eval_samples_per_second": 4.598,
465
+ "eval_steps_per_second": 0.169,
466
+ "step": 282
467
  },
468
  {
469
+ "epoch": 29.743589743589745,
470
+ "grad_norm": 3.0022389888763428,
471
+ "learning_rate": 1.9753086419753087e-05,
472
+ "loss": 0.2154,
473
+ "step": 290
474
  },
475
  {
476
+ "epoch": 29.94871794871795,
477
+ "eval_accuracy": 0.9191176470588235,
478
+ "eval_loss": 0.3362026810646057,
479
+ "eval_runtime": 29.8144,
480
+ "eval_samples_per_second": 4.562,
481
+ "eval_steps_per_second": 0.168,
482
+ "step": 292
483
  },
484
  {
485
+ "epoch": 30.76923076923077,
486
+ "grad_norm": 1.9505140781402588,
487
+ "learning_rate": 1.8518518518518518e-05,
488
+ "loss": 0.1801,
489
+ "step": 300
490
  },
491
  {
492
+ "epoch": 30.974358974358974,
493
+ "eval_accuracy": 0.875,
494
+ "eval_loss": 0.4357524812221527,
495
+ "eval_runtime": 28.9816,
496
+ "eval_samples_per_second": 4.693,
497
+ "eval_steps_per_second": 0.173,
498
+ "step": 302
499
  },
500
  {
501
+ "epoch": 31.794871794871796,
502
+ "grad_norm": 2.702116012573242,
503
+ "learning_rate": 1.728395061728395e-05,
504
+ "loss": 0.1794,
505
+ "step": 310
506
  },
507
  {
508
+ "epoch": 32.0,
509
  "eval_accuracy": 0.9191176470588235,
510
+ "eval_loss": 0.3500049114227295,
511
+ "eval_runtime": 29.5602,
512
+ "eval_samples_per_second": 4.601,
513
+ "eval_steps_per_second": 0.169,
514
+ "step": 312
 
 
 
 
 
 
 
515
  },
516
  {
517
+ "epoch": 32.82051282051282,
518
+ "grad_norm": 1.1489578485488892,
519
+ "learning_rate": 1.604938271604938e-05,
520
+ "loss": 0.1566,
521
+ "step": 320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  },
523
  {
524
+ "epoch": 32.92307692307692,
525
  "eval_accuracy": 0.9264705882352942,
526
+ "eval_loss": 0.30455005168914795,
527
+ "eval_runtime": 29.3612,
528
+ "eval_samples_per_second": 4.632,
529
+ "eval_steps_per_second": 0.17,
530
+ "step": 321
531
  },
532
  {
533
+ "epoch": 33.84615384615385,
534
+ "grad_norm": 1.6699856519699097,
535
+ "learning_rate": 1.4814814814814815e-05,
536
+ "loss": 0.1432,
537
+ "step": 330
538
  },
539
  {
540
+ "epoch": 33.94871794871795,
541
+ "eval_accuracy": 0.9264705882352942,
542
+ "eval_loss": 0.3239189088344574,
543
+ "eval_runtime": 29.5627,
544
+ "eval_samples_per_second": 4.6,
545
+ "eval_steps_per_second": 0.169,
546
+ "step": 331
547
  },
548
  {
549
+ "epoch": 34.87179487179487,
550
+ "grad_norm": 1.4475438594818115,
551
+ "learning_rate": 1.3580246913580247e-05,
552
+ "loss": 0.145,
553
+ "step": 340
554
  },
555
  {
556
+ "epoch": 34.97435897435897,
557
+ "eval_accuracy": 0.9338235294117647,
558
+ "eval_loss": 0.33108076453208923,
559
+ "eval_runtime": 29.2908,
560
+ "eval_samples_per_second": 4.643,
561
+ "eval_steps_per_second": 0.171,
562
+ "step": 341
563
  },
564
  {
565
+ "epoch": 35.8974358974359,
566
+ "grad_norm": 1.5676722526550293,
567
+ "learning_rate": 1.2345679012345678e-05,
568
+ "loss": 0.1578,
569
+ "step": 350
 
 
570
  },
571
  {
572
+ "epoch": 36.0,
573
+ "eval_accuracy": 0.9338235294117647,
574
+ "eval_loss": 0.30287855863571167,
575
+ "eval_runtime": 29.0519,
576
+ "eval_samples_per_second": 4.681,
577
+ "eval_steps_per_second": 0.172,
578
+ "step": 351
579
  },
580
  {
581
+ "epoch": 36.92307692307692,
582
+ "grad_norm": 1.0356682538986206,
583
+ "learning_rate": 1.1111111111111112e-05,
584
+ "loss": 0.1511,
585
+ "step": 360
586
  },
587
  {
588
+ "epoch": 36.92307692307692,
589
+ "eval_accuracy": 0.9338235294117647,
590
+ "eval_loss": 0.30101922154426575,
591
+ "eval_runtime": 31.6784,
592
+ "eval_samples_per_second": 4.293,
593
+ "eval_steps_per_second": 0.158,
594
+ "step": 360
595
  },
596
  {
597
+ "epoch": 37.94871794871795,
598
+ "grad_norm": 3.916630268096924,
599
+ "learning_rate": 9.876543209876543e-06,
600
+ "loss": 0.139,
601
+ "step": 370
602
  },
603
  {
604
+ "epoch": 37.94871794871795,
605
  "eval_accuracy": 0.9264705882352942,
606
+ "eval_loss": 0.29820650815963745,
607
+ "eval_runtime": 29.7228,
608
+ "eval_samples_per_second": 4.576,
609
+ "eval_steps_per_second": 0.168,
610
+ "step": 370
 
 
 
 
 
 
 
611
  },
612
  {
613
+ "epoch": 38.97435897435897,
614
+ "grad_norm": 0.7166056632995605,
615
+ "learning_rate": 8.641975308641975e-06,
616
+ "loss": 0.1294,
617
+ "step": 380
618
  },
619
  {
620
+ "epoch": 38.97435897435897,
621
+ "eval_accuracy": 0.9191176470588235,
622
+ "eval_loss": 0.32610374689102173,
623
+ "eval_runtime": 28.752,
624
+ "eval_samples_per_second": 4.73,
625
+ "eval_steps_per_second": 0.174,
626
+ "step": 380
627
  },
628
  {
629
+ "epoch": 40.0,
630
+ "grad_norm": 2.002183675765991,
631
+ "learning_rate": 7.4074074074074075e-06,
632
+ "loss": 0.1263,
633
+ "step": 390
634
  },
635
  {
636
+ "epoch": 40.0,
637
+ "eval_accuracy": 0.9338235294117647,
638
+ "eval_loss": 0.2932202219963074,
639
+ "eval_runtime": 29.503,
640
+ "eval_samples_per_second": 4.61,
641
+ "eval_steps_per_second": 0.169,
642
+ "step": 390
643
  },
644
  {
645
+ "epoch": 40.92307692307692,
646
+ "eval_accuracy": 0.9338235294117647,
647
+ "eval_loss": 0.2943984270095825,
648
+ "eval_runtime": 29.1815,
649
+ "eval_samples_per_second": 4.66,
650
+ "eval_steps_per_second": 0.171,
651
+ "step": 399
652
  },
653
  {
654
+ "epoch": 41.02564102564103,
655
+ "grad_norm": 0.8789101839065552,
656
+ "learning_rate": 6.172839506172839e-06,
657
+ "loss": 0.1216,
658
+ "step": 400
659
  },
660
  {
661
+ "epoch": 41.94871794871795,
662
+ "eval_accuracy": 0.9338235294117647,
663
+ "eval_loss": 0.2867204248905182,
664
+ "eval_runtime": 28.6349,
665
+ "eval_samples_per_second": 4.749,
666
+ "eval_steps_per_second": 0.175,
667
+ "step": 409
668
  },
669
  {
670
+ "epoch": 42.05128205128205,
671
+ "grad_norm": 2.1831233501434326,
672
+ "learning_rate": 4.938271604938272e-06,
673
+ "loss": 0.1199,
674
+ "step": 410
675
  },
676
  {
677
+ "epoch": 42.97435897435897,
678
+ "eval_accuracy": 0.9338235294117647,
679
+ "eval_loss": 0.28869205713272095,
680
+ "eval_runtime": 28.6692,
681
+ "eval_samples_per_second": 4.744,
682
+ "eval_steps_per_second": 0.174,
683
+ "step": 419
684
  },
685
  {
686
+ "epoch": 43.07692307692308,
687
+ "grad_norm": 3.3434555530548096,
688
+ "learning_rate": 3.7037037037037037e-06,
689
+ "loss": 0.128,
690
+ "step": 420
691
  },
692
  {
693
+ "epoch": 44.0,
694
+ "eval_accuracy": 0.9338235294117647,
695
+ "eval_loss": 0.28250837326049805,
696
+ "eval_runtime": 28.5349,
697
+ "eval_samples_per_second": 4.766,
698
+ "eval_steps_per_second": 0.175,
699
+ "step": 429
700
  },
701
  {
702
+ "epoch": 44.1025641025641,
703
+ "grad_norm": 1.314479112625122,
704
+ "learning_rate": 2.469135802469136e-06,
705
+ "loss": 0.1115,
706
+ "step": 430
707
  },
708
  {
709
+ "epoch": 44.92307692307692,
710
+ "eval_accuracy": 0.9338235294117647,
711
+ "eval_loss": 0.2879542410373688,
712
+ "eval_runtime": 28.8609,
713
+ "eval_samples_per_second": 4.712,
714
+ "eval_steps_per_second": 0.173,
715
+ "step": 438
716
  },
717
  {
718
+ "epoch": 45.12820512820513,
719
+ "grad_norm": 1.4549723863601685,
720
+ "learning_rate": 1.234567901234568e-06,
721
+ "loss": 0.1179,
722
+ "step": 440
723
  },
724
  {
725
+ "epoch": 45.94871794871795,
726
+ "eval_accuracy": 0.9338235294117647,
727
+ "eval_loss": 0.2870730757713318,
728
+ "eval_runtime": 29.6828,
729
+ "eval_samples_per_second": 4.582,
730
+ "eval_steps_per_second": 0.168,
731
+ "step": 448
732
  },
733
  {
734
+ "epoch": 46.15384615384615,
735
+ "grad_norm": 1.243077039718628,
736
+ "learning_rate": 0.0,
737
+ "loss": 0.12,
738
+ "step": 450
739
  },
740
  {
741
+ "epoch": 46.15384615384615,
742
+ "eval_accuracy": 0.9338235294117647,
743
+ "eval_loss": 0.28695058822631836,
744
+ "eval_runtime": 31.6423,
745
+ "eval_samples_per_second": 4.298,
746
+ "eval_steps_per_second": 0.158,
747
+ "step": 450
748
  },
749
  {
750
+ "epoch": 46.15384615384615,
751
+ "step": 450,
752
+ "total_flos": 4.3781443993328026e+18,
753
+ "train_loss": 0.41361336602105037,
754
+ "train_runtime": 14457.8356,
755
+ "train_samples_per_second": 4.233,
756
+ "train_steps_per_second": 0.031
 
 
 
 
 
 
 
 
 
757
  }
758
  ],
759
  "logging_steps": 10,
760
+ "max_steps": 450,
761
  "num_input_tokens_seen": 0,
762
+ "num_train_epochs": 50,
763
  "save_steps": 500,
764
  "stateful_callbacks": {
765
  "TrainerControl": {
 
773
  "attributes": {}
774
  }
775
  },
776
+ "total_flos": 4.3781443993328026e+18,
777
+ "train_batch_size": 32,
778
  "trial_name": null,
779
  "trial_params": null
780
  }