NiharGupte commited on
Commit
19052c9
1 Parent(s): aefe6b5

Training in progress, epoch 0

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 6.302667737382912e+17,
4
- "train_loss": 0.13929352825309368,
5
- "train_runtime": 703.6937,
6
- "train_samples_per_second": 42.177,
7
- "train_steps_per_second": 1.336
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 1.575666934345728e+17,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 166.8151,
6
+ "train_samples_per_second": 44.48,
7
+ "train_steps_per_second": 1.409
8
  }
runs/May04_07-56-14_4f22111e1b44/events.out.tfevents.1714809377.4f22111e1b44.9006.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b949d6e096ca1889d9c7c6208c2aed3e3e8d992851574d4caf15ca1287fad7c2
3
+ size 5370
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 6.302667737382912e+17,
4
- "train_loss": 0.13929352825309368,
5
- "train_runtime": 703.6937,
6
- "train_samples_per_second": 42.177,
7
- "train_steps_per_second": 1.336
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 1.575666934345728e+17,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 166.8151,
6
+ "train_samples_per_second": 44.48,
7
+ "train_steps_per_second": 1.409
8
  }
trainer_state.json CHANGED
@@ -1,867 +1,235 @@
1
  {
2
- "best_metric": 1.0,
3
- "best_model_checkpoint": "resnet-50-finetuned-student_kaggle/checkpoint-423",
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 940,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2127659574468085,
13
- "grad_norm": 54.35947799682617,
14
- "learning_rate": 5.319148936170213e-06,
15
- "loss": 0.9341,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.425531914893617,
20
- "grad_norm": 34.59556579589844,
21
- "learning_rate": 1.0638297872340426e-05,
22
- "loss": 0.9157,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.6382978723404256,
27
- "grad_norm": 42.179847717285156,
28
- "learning_rate": 1.595744680851064e-05,
29
- "loss": 0.7801,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.851063829787234,
34
- "grad_norm": 30.099029541015625,
35
- "learning_rate": 2.1276595744680852e-05,
36
- "loss": 0.7142,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 1.0,
41
- "eval_accuracy": 0.610062893081761,
42
- "eval_loss": 0.6418222188949585,
43
- "eval_runtime": 7.6299,
44
- "eval_samples_per_second": 83.356,
45
- "eval_steps_per_second": 2.621,
46
  "step": 47
47
  },
48
  {
49
  "epoch": 1.0638297872340425,
50
- "grad_norm": 48.046546936035156,
51
- "learning_rate": 2.6595744680851064e-05,
52
- "loss": 0.7114,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 1.2765957446808511,
57
- "grad_norm": 45.94879913330078,
58
- "learning_rate": 3.191489361702128e-05,
59
- "loss": 0.6014,
60
  "step": 60
61
  },
62
  {
63
  "epoch": 1.4893617021276595,
64
- "grad_norm": 17.69209861755371,
65
- "learning_rate": 3.723404255319149e-05,
66
- "loss": 0.4815,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 1.702127659574468,
71
- "grad_norm": 18.821670532226562,
72
- "learning_rate": 4.2553191489361704e-05,
73
- "loss": 0.463,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 1.9148936170212765,
78
- "grad_norm": 23.751588821411133,
79
- "learning_rate": 4.787234042553192e-05,
80
- "loss": 0.3351,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 2.0,
85
- "eval_accuracy": 0.8946540880503144,
86
- "eval_loss": 0.25965991616249084,
87
- "eval_runtime": 7.6659,
88
- "eval_samples_per_second": 82.964,
89
- "eval_steps_per_second": 2.609,
90
  "step": 94
91
  },
92
  {
93
  "epoch": 2.127659574468085,
94
- "grad_norm": 18.11069679260254,
95
- "learning_rate": 4.964539007092199e-05,
96
- "loss": 0.3193,
97
  "step": 100
98
  },
99
  {
100
  "epoch": 2.3404255319148937,
101
- "grad_norm": 12.397391319274902,
102
- "learning_rate": 4.905437352245863e-05,
103
- "loss": 0.2768,
104
  "step": 110
105
  },
106
  {
107
  "epoch": 2.5531914893617023,
108
- "grad_norm": 16.857635498046875,
109
- "learning_rate": 4.846335697399527e-05,
110
- "loss": 0.2594,
111
  "step": 120
112
  },
113
  {
114
  "epoch": 2.7659574468085104,
115
- "grad_norm": 12.635449409484863,
116
- "learning_rate": 4.787234042553192e-05,
117
- "loss": 0.2063,
118
  "step": 130
119
  },
120
  {
121
  "epoch": 2.978723404255319,
122
- "grad_norm": 15.277303695678711,
123
- "learning_rate": 4.728132387706856e-05,
124
- "loss": 0.2574,
125
  "step": 140
126
  },
127
  {
128
  "epoch": 3.0,
129
- "eval_accuracy": 0.9779874213836478,
130
- "eval_loss": 0.10460298508405685,
131
- "eval_runtime": 8.3391,
132
- "eval_samples_per_second": 76.267,
133
- "eval_steps_per_second": 2.398,
134
  "step": 141
135
  },
136
  {
137
  "epoch": 3.1914893617021276,
138
- "grad_norm": 14.497098922729492,
139
- "learning_rate": 4.669030732860521e-05,
140
- "loss": 0.2349,
141
  "step": 150
142
  },
143
  {
144
  "epoch": 3.404255319148936,
145
- "grad_norm": 17.647092819213867,
146
- "learning_rate": 4.609929078014185e-05,
147
- "loss": 0.1631,
148
  "step": 160
149
  },
150
  {
151
  "epoch": 3.617021276595745,
152
- "grad_norm": 12.856146812438965,
153
- "learning_rate": 4.550827423167849e-05,
154
- "loss": 0.1675,
155
  "step": 170
156
  },
157
  {
158
  "epoch": 3.829787234042553,
159
- "grad_norm": 7.248583793640137,
160
- "learning_rate": 4.491725768321513e-05,
161
- "loss": 0.1479,
162
  "step": 180
163
  },
164
  {
165
  "epoch": 4.0,
166
- "eval_accuracy": 0.9874213836477987,
167
- "eval_loss": 0.061614990234375,
168
- "eval_runtime": 8.4097,
169
- "eval_samples_per_second": 75.627,
170
- "eval_steps_per_second": 2.378,
171
  "step": 188
172
  },
173
  {
174
  "epoch": 4.042553191489362,
175
- "grad_norm": 25.721847534179688,
176
- "learning_rate": 4.432624113475177e-05,
177
- "loss": 0.1528,
178
  "step": 190
179
  },
180
  {
181
  "epoch": 4.25531914893617,
182
- "grad_norm": 6.252942085266113,
183
- "learning_rate": 4.373522458628842e-05,
184
- "loss": 0.145,
185
  "step": 200
186
  },
187
  {
188
  "epoch": 4.468085106382979,
189
- "grad_norm": 6.672601222991943,
190
- "learning_rate": 4.3144208037825064e-05,
191
- "loss": 0.1247,
192
  "step": 210
193
  },
194
  {
195
  "epoch": 4.680851063829787,
196
- "grad_norm": 20.4866886138916,
197
- "learning_rate": 4.2553191489361704e-05,
198
- "loss": 0.1405,
199
  "step": 220
200
  },
201
  {
202
  "epoch": 4.8936170212765955,
203
- "grad_norm": 19.644893646240234,
204
- "learning_rate": 4.1962174940898345e-05,
205
- "loss": 0.1284,
206
  "step": 230
207
  },
208
  {
209
  "epoch": 5.0,
210
- "eval_accuracy": 0.9952830188679245,
211
- "eval_loss": 0.02317511849105358,
212
- "eval_runtime": 10.3441,
213
- "eval_samples_per_second": 61.485,
214
- "eval_steps_per_second": 1.933,
215
  "step": 235
216
  },
217
  {
218
- "epoch": 5.1063829787234045,
219
- "grad_norm": 10.905556678771973,
220
- "learning_rate": 4.1371158392434986e-05,
221
- "loss": 0.1178,
222
- "step": 240
223
- },
224
- {
225
- "epoch": 5.319148936170213,
226
- "grad_norm": 11.02078628540039,
227
- "learning_rate": 4.078014184397163e-05,
228
- "loss": 0.1176,
229
- "step": 250
230
- },
231
- {
232
- "epoch": 5.531914893617021,
233
- "grad_norm": 7.510810375213623,
234
- "learning_rate": 4.018912529550828e-05,
235
- "loss": 0.0881,
236
- "step": 260
237
- },
238
- {
239
- "epoch": 5.74468085106383,
240
- "grad_norm": 2.0541610717773438,
241
- "learning_rate": 3.959810874704492e-05,
242
- "loss": 0.1274,
243
- "step": 270
244
- },
245
- {
246
- "epoch": 5.957446808510638,
247
- "grad_norm": 7.680713653564453,
248
- "learning_rate": 3.900709219858156e-05,
249
- "loss": 0.077,
250
- "step": 280
251
- },
252
- {
253
- "epoch": 6.0,
254
- "eval_accuracy": 0.9952830188679245,
255
- "eval_loss": 0.015012426301836967,
256
- "eval_runtime": 8.4118,
257
- "eval_samples_per_second": 75.608,
258
- "eval_steps_per_second": 2.378,
259
- "step": 282
260
- },
261
- {
262
- "epoch": 6.170212765957447,
263
- "grad_norm": 8.117420196533203,
264
- "learning_rate": 3.84160756501182e-05,
265
- "loss": 0.172,
266
- "step": 290
267
- },
268
- {
269
- "epoch": 6.382978723404255,
270
- "grad_norm": 23.871868133544922,
271
- "learning_rate": 3.782505910165485e-05,
272
- "loss": 0.0613,
273
- "step": 300
274
- },
275
- {
276
- "epoch": 6.595744680851064,
277
- "grad_norm": 15.407998085021973,
278
- "learning_rate": 3.723404255319149e-05,
279
- "loss": 0.1287,
280
- "step": 310
281
- },
282
- {
283
- "epoch": 6.808510638297872,
284
- "grad_norm": 6.940992832183838,
285
- "learning_rate": 3.664302600472813e-05,
286
- "loss": 0.103,
287
- "step": 320
288
- },
289
- {
290
- "epoch": 7.0,
291
- "eval_accuracy": 0.9984276729559748,
292
- "eval_loss": 0.010532047599554062,
293
- "eval_runtime": 7.9689,
294
- "eval_samples_per_second": 79.81,
295
- "eval_steps_per_second": 2.51,
296
- "step": 329
297
- },
298
- {
299
- "epoch": 7.0212765957446805,
300
- "grad_norm": 4.598968029022217,
301
- "learning_rate": 3.605200945626478e-05,
302
- "loss": 0.0906,
303
- "step": 330
304
- },
305
- {
306
- "epoch": 7.23404255319149,
307
- "grad_norm": 7.53684663772583,
308
- "learning_rate": 3.546099290780142e-05,
309
- "loss": 0.0792,
310
- "step": 340
311
- },
312
- {
313
- "epoch": 7.446808510638298,
314
- "grad_norm": 2.750072479248047,
315
- "learning_rate": 3.4869976359338065e-05,
316
- "loss": 0.091,
317
- "step": 350
318
- },
319
- {
320
- "epoch": 7.659574468085106,
321
- "grad_norm": 4.067008018493652,
322
- "learning_rate": 3.4278959810874706e-05,
323
- "loss": 0.0777,
324
- "step": 360
325
- },
326
- {
327
- "epoch": 7.872340425531915,
328
- "grad_norm": 15.093037605285645,
329
- "learning_rate": 3.3687943262411347e-05,
330
- "loss": 0.0922,
331
- "step": 370
332
- },
333
- {
334
- "epoch": 8.0,
335
- "eval_accuracy": 0.9984276729559748,
336
- "eval_loss": 0.009353628382086754,
337
- "eval_runtime": 7.7264,
338
- "eval_samples_per_second": 82.315,
339
- "eval_steps_per_second": 2.589,
340
- "step": 376
341
- },
342
- {
343
- "epoch": 8.085106382978724,
344
- "grad_norm": 8.675619125366211,
345
- "learning_rate": 3.309692671394799e-05,
346
- "loss": 0.1211,
347
- "step": 380
348
- },
349
- {
350
- "epoch": 8.297872340425531,
351
- "grad_norm": 5.723608493804932,
352
- "learning_rate": 3.2505910165484634e-05,
353
- "loss": 0.0645,
354
- "step": 390
355
- },
356
- {
357
- "epoch": 8.51063829787234,
358
- "grad_norm": 8.031245231628418,
359
- "learning_rate": 3.191489361702128e-05,
360
- "loss": 0.0787,
361
- "step": 400
362
- },
363
- {
364
- "epoch": 8.72340425531915,
365
- "grad_norm": 2.483238935470581,
366
- "learning_rate": 3.132387706855792e-05,
367
- "loss": 0.0683,
368
- "step": 410
369
- },
370
- {
371
- "epoch": 8.936170212765958,
372
- "grad_norm": 7.612273216247559,
373
- "learning_rate": 3.073286052009456e-05,
374
- "loss": 0.08,
375
- "step": 420
376
- },
377
- {
378
- "epoch": 9.0,
379
- "eval_accuracy": 1.0,
380
- "eval_loss": 0.00555912172421813,
381
- "eval_runtime": 8.4427,
382
- "eval_samples_per_second": 75.331,
383
- "eval_steps_per_second": 2.369,
384
- "step": 423
385
- },
386
- {
387
- "epoch": 9.148936170212766,
388
- "grad_norm": 2.0842654705047607,
389
- "learning_rate": 3.0141843971631207e-05,
390
- "loss": 0.0812,
391
- "step": 430
392
- },
393
- {
394
- "epoch": 9.361702127659575,
395
- "grad_norm": 17.500883102416992,
396
- "learning_rate": 2.9550827423167847e-05,
397
- "loss": 0.0688,
398
- "step": 440
399
- },
400
- {
401
- "epoch": 9.574468085106384,
402
- "grad_norm": 9.671988487243652,
403
- "learning_rate": 2.895981087470449e-05,
404
- "loss": 0.0837,
405
- "step": 450
406
- },
407
- {
408
- "epoch": 9.787234042553191,
409
- "grad_norm": 13.061412811279297,
410
- "learning_rate": 2.836879432624114e-05,
411
- "loss": 0.1024,
412
- "step": 460
413
- },
414
- {
415
- "epoch": 10.0,
416
- "grad_norm": 11.679757118225098,
417
- "learning_rate": 2.777777777777778e-05,
418
- "loss": 0.0492,
419
- "step": 470
420
- },
421
- {
422
- "epoch": 10.0,
423
- "eval_accuracy": 1.0,
424
- "eval_loss": 0.004496446345001459,
425
- "eval_runtime": 9.0183,
426
- "eval_samples_per_second": 70.523,
427
- "eval_steps_per_second": 2.218,
428
- "step": 470
429
- },
430
- {
431
- "epoch": 10.212765957446809,
432
- "grad_norm": 3.253716230392456,
433
- "learning_rate": 2.7186761229314423e-05,
434
- "loss": 0.0495,
435
- "step": 480
436
- },
437
- {
438
- "epoch": 10.425531914893616,
439
- "grad_norm": 5.826026916503906,
440
- "learning_rate": 2.6595744680851064e-05,
441
- "loss": 0.0266,
442
- "step": 490
443
- },
444
- {
445
- "epoch": 10.638297872340425,
446
- "grad_norm": 9.738965034484863,
447
- "learning_rate": 2.6004728132387708e-05,
448
- "loss": 0.0739,
449
- "step": 500
450
- },
451
- {
452
- "epoch": 10.851063829787234,
453
- "grad_norm": 9.04916763305664,
454
- "learning_rate": 2.5413711583924348e-05,
455
- "loss": 0.0574,
456
- "step": 510
457
- },
458
- {
459
- "epoch": 11.0,
460
- "eval_accuracy": 1.0,
461
- "eval_loss": 0.004308629781007767,
462
- "eval_runtime": 8.4135,
463
- "eval_samples_per_second": 75.593,
464
- "eval_steps_per_second": 2.377,
465
- "step": 517
466
- },
467
- {
468
- "epoch": 11.063829787234043,
469
- "grad_norm": 11.041916847229004,
470
- "learning_rate": 2.4822695035460995e-05,
471
- "loss": 0.0977,
472
- "step": 520
473
- },
474
- {
475
- "epoch": 11.27659574468085,
476
- "grad_norm": 31.96723175048828,
477
- "learning_rate": 2.4231678486997636e-05,
478
- "loss": 0.0541,
479
- "step": 530
480
- },
481
- {
482
- "epoch": 11.48936170212766,
483
- "grad_norm": 2.877957344055176,
484
- "learning_rate": 2.364066193853428e-05,
485
- "loss": 0.0401,
486
- "step": 540
487
- },
488
- {
489
- "epoch": 11.702127659574469,
490
- "grad_norm": 3.020596742630005,
491
- "learning_rate": 2.3049645390070924e-05,
492
- "loss": 0.0284,
493
- "step": 550
494
- },
495
- {
496
- "epoch": 11.914893617021276,
497
- "grad_norm": 7.811126232147217,
498
- "learning_rate": 2.2458628841607564e-05,
499
- "loss": 0.0382,
500
- "step": 560
501
- },
502
- {
503
- "epoch": 12.0,
504
- "eval_accuracy": 1.0,
505
- "eval_loss": 0.0022806336637586355,
506
- "eval_runtime": 7.7232,
507
- "eval_samples_per_second": 82.35,
508
- "eval_steps_per_second": 2.59,
509
- "step": 564
510
- },
511
- {
512
- "epoch": 12.127659574468085,
513
- "grad_norm": 11.633199691772461,
514
- "learning_rate": 2.186761229314421e-05,
515
- "loss": 0.0589,
516
- "step": 570
517
- },
518
- {
519
- "epoch": 12.340425531914894,
520
- "grad_norm": 0.3253759443759918,
521
- "learning_rate": 2.1276595744680852e-05,
522
- "loss": 0.041,
523
- "step": 580
524
- },
525
- {
526
- "epoch": 12.553191489361701,
527
- "grad_norm": 2.2859044075012207,
528
- "learning_rate": 2.0685579196217493e-05,
529
- "loss": 0.0578,
530
- "step": 590
531
- },
532
- {
533
- "epoch": 12.76595744680851,
534
- "grad_norm": 1.5324259996414185,
535
- "learning_rate": 2.009456264775414e-05,
536
- "loss": 0.0312,
537
- "step": 600
538
- },
539
- {
540
- "epoch": 12.97872340425532,
541
- "grad_norm": 6.985143661499023,
542
- "learning_rate": 1.950354609929078e-05,
543
- "loss": 0.0666,
544
- "step": 610
545
- },
546
- {
547
- "epoch": 13.0,
548
- "eval_accuracy": 1.0,
549
- "eval_loss": 0.0022491966374218464,
550
- "eval_runtime": 8.1081,
551
- "eval_samples_per_second": 78.44,
552
- "eval_steps_per_second": 2.467,
553
- "step": 611
554
- },
555
- {
556
- "epoch": 13.191489361702128,
557
- "grad_norm": 8.847366333007812,
558
- "learning_rate": 1.8912529550827425e-05,
559
- "loss": 0.0539,
560
- "step": 620
561
- },
562
- {
563
- "epoch": 13.404255319148936,
564
- "grad_norm": 10.476814270019531,
565
- "learning_rate": 1.8321513002364065e-05,
566
- "loss": 0.0369,
567
- "step": 630
568
- },
569
- {
570
- "epoch": 13.617021276595745,
571
- "grad_norm": 5.339621067047119,
572
- "learning_rate": 1.773049645390071e-05,
573
- "loss": 0.0308,
574
- "step": 640
575
- },
576
- {
577
- "epoch": 13.829787234042554,
578
- "grad_norm": 14.648975372314453,
579
- "learning_rate": 1.7139479905437353e-05,
580
- "loss": 0.0477,
581
- "step": 650
582
- },
583
- {
584
- "epoch": 14.0,
585
- "eval_accuracy": 1.0,
586
- "eval_loss": 0.0021932125091552734,
587
- "eval_runtime": 8.4493,
588
- "eval_samples_per_second": 75.272,
589
- "eval_steps_per_second": 2.367,
590
- "step": 658
591
- },
592
- {
593
- "epoch": 14.042553191489361,
594
- "grad_norm": 5.510159969329834,
595
- "learning_rate": 1.6548463356973994e-05,
596
- "loss": 0.028,
597
- "step": 660
598
- },
599
- {
600
- "epoch": 14.25531914893617,
601
- "grad_norm": 5.803068161010742,
602
- "learning_rate": 1.595744680851064e-05,
603
- "loss": 0.0522,
604
- "step": 670
605
- },
606
- {
607
- "epoch": 14.46808510638298,
608
- "grad_norm": 1.1623107194900513,
609
- "learning_rate": 1.536643026004728e-05,
610
- "loss": 0.0481,
611
- "step": 680
612
- },
613
- {
614
- "epoch": 14.680851063829786,
615
- "grad_norm": 12.495600700378418,
616
- "learning_rate": 1.4775413711583924e-05,
617
- "loss": 0.0588,
618
- "step": 690
619
- },
620
- {
621
- "epoch": 14.893617021276595,
622
- "grad_norm": 3.4236888885498047,
623
- "learning_rate": 1.418439716312057e-05,
624
- "loss": 0.0614,
625
- "step": 700
626
- },
627
- {
628
- "epoch": 15.0,
629
- "eval_accuracy": 1.0,
630
- "eval_loss": 0.002270177938044071,
631
- "eval_runtime": 8.5563,
632
- "eval_samples_per_second": 74.331,
633
- "eval_steps_per_second": 2.337,
634
- "step": 705
635
- },
636
- {
637
- "epoch": 15.106382978723405,
638
- "grad_norm": 11.681058883666992,
639
- "learning_rate": 1.3593380614657212e-05,
640
- "loss": 0.0674,
641
- "step": 710
642
- },
643
- {
644
- "epoch": 15.319148936170214,
645
- "grad_norm": 1.846946120262146,
646
- "learning_rate": 1.3002364066193854e-05,
647
- "loss": 0.0415,
648
- "step": 720
649
- },
650
- {
651
- "epoch": 15.53191489361702,
652
- "grad_norm": 8.939858436584473,
653
- "learning_rate": 1.2411347517730498e-05,
654
- "loss": 0.0189,
655
- "step": 730
656
- },
657
- {
658
- "epoch": 15.74468085106383,
659
- "grad_norm": 3.521784782409668,
660
- "learning_rate": 1.182033096926714e-05,
661
- "loss": 0.0585,
662
- "step": 740
663
- },
664
- {
665
- "epoch": 15.957446808510639,
666
- "grad_norm": 1.9891993999481201,
667
- "learning_rate": 1.1229314420803782e-05,
668
- "loss": 0.0282,
669
- "step": 750
670
- },
671
- {
672
- "epoch": 16.0,
673
- "eval_accuracy": 1.0,
674
- "eval_loss": 0.0013930280692875385,
675
- "eval_runtime": 8.1789,
676
- "eval_samples_per_second": 77.761,
677
- "eval_steps_per_second": 2.445,
678
- "step": 752
679
- },
680
- {
681
- "epoch": 16.170212765957448,
682
- "grad_norm": 7.0705246925354,
683
- "learning_rate": 1.0638297872340426e-05,
684
- "loss": 0.0508,
685
- "step": 760
686
- },
687
- {
688
- "epoch": 16.382978723404257,
689
- "grad_norm": 11.365514755249023,
690
- "learning_rate": 1.004728132387707e-05,
691
- "loss": 0.0393,
692
- "step": 770
693
- },
694
- {
695
- "epoch": 16.595744680851062,
696
- "grad_norm": 8.82397747039795,
697
- "learning_rate": 9.456264775413712e-06,
698
- "loss": 0.0509,
699
- "step": 780
700
- },
701
- {
702
- "epoch": 16.80851063829787,
703
- "grad_norm": 5.013731002807617,
704
- "learning_rate": 8.865248226950355e-06,
705
- "loss": 0.0659,
706
- "step": 790
707
- },
708
- {
709
- "epoch": 17.0,
710
- "eval_accuracy": 1.0,
711
- "eval_loss": 0.0016287014586851,
712
- "eval_runtime": 7.6703,
713
- "eval_samples_per_second": 82.917,
714
- "eval_steps_per_second": 2.607,
715
- "step": 799
716
- },
717
- {
718
- "epoch": 17.02127659574468,
719
- "grad_norm": 2.8596644401550293,
720
- "learning_rate": 8.274231678486997e-06,
721
- "loss": 0.0285,
722
- "step": 800
723
- },
724
- {
725
- "epoch": 17.23404255319149,
726
- "grad_norm": 10.184608459472656,
727
- "learning_rate": 7.68321513002364e-06,
728
- "loss": 0.062,
729
- "step": 810
730
- },
731
- {
732
- "epoch": 17.4468085106383,
733
- "grad_norm": 6.029819011688232,
734
- "learning_rate": 7.092198581560285e-06,
735
- "loss": 0.0672,
736
- "step": 820
737
- },
738
- {
739
- "epoch": 17.659574468085108,
740
- "grad_norm": 0.9212875366210938,
741
- "learning_rate": 6.501182033096927e-06,
742
- "loss": 0.0404,
743
- "step": 830
744
- },
745
- {
746
- "epoch": 17.872340425531917,
747
- "grad_norm": 0.5147794485092163,
748
- "learning_rate": 5.91016548463357e-06,
749
- "loss": 0.0586,
750
- "step": 840
751
- },
752
- {
753
- "epoch": 18.0,
754
- "eval_accuracy": 1.0,
755
- "eval_loss": 0.0009691208251751959,
756
- "eval_runtime": 8.4381,
757
- "eval_samples_per_second": 75.373,
758
- "eval_steps_per_second": 2.37,
759
- "step": 846
760
- },
761
- {
762
- "epoch": 18.085106382978722,
763
- "grad_norm": 3.351142406463623,
764
- "learning_rate": 5.319148936170213e-06,
765
- "loss": 0.0333,
766
- "step": 850
767
- },
768
- {
769
- "epoch": 18.29787234042553,
770
- "grad_norm": 7.570976257324219,
771
- "learning_rate": 4.728132387706856e-06,
772
- "loss": 0.0401,
773
- "step": 860
774
- },
775
- {
776
- "epoch": 18.51063829787234,
777
- "grad_norm": 6.660007953643799,
778
- "learning_rate": 4.137115839243498e-06,
779
- "loss": 0.0329,
780
- "step": 870
781
- },
782
- {
783
- "epoch": 18.72340425531915,
784
- "grad_norm": 11.373592376708984,
785
- "learning_rate": 3.5460992907801423e-06,
786
- "loss": 0.0523,
787
- "step": 880
788
- },
789
- {
790
- "epoch": 18.93617021276596,
791
- "grad_norm": 4.553757667541504,
792
- "learning_rate": 2.955082742316785e-06,
793
- "loss": 0.0557,
794
- "step": 890
795
- },
796
- {
797
- "epoch": 19.0,
798
- "eval_accuracy": 1.0,
799
- "eval_loss": 0.0012750416062772274,
800
- "eval_runtime": 8.3957,
801
- "eval_samples_per_second": 75.753,
802
- "eval_steps_per_second": 2.382,
803
- "step": 893
804
- },
805
- {
806
- "epoch": 19.148936170212767,
807
- "grad_norm": 1.8293001651763916,
808
- "learning_rate": 2.364066193853428e-06,
809
- "loss": 0.0248,
810
- "step": 900
811
- },
812
- {
813
- "epoch": 19.361702127659573,
814
- "grad_norm": 3.5974695682525635,
815
- "learning_rate": 1.7730496453900712e-06,
816
- "loss": 0.0298,
817
- "step": 910
818
- },
819
- {
820
- "epoch": 19.574468085106382,
821
- "grad_norm": 4.631837844848633,
822
- "learning_rate": 1.182033096926714e-06,
823
- "loss": 0.0272,
824
- "step": 920
825
- },
826
- {
827
- "epoch": 19.78723404255319,
828
- "grad_norm": 1.5552431344985962,
829
- "learning_rate": 5.91016548463357e-07,
830
- "loss": 0.0281,
831
- "step": 930
832
- },
833
- {
834
- "epoch": 20.0,
835
- "grad_norm": 5.388515949249268,
836
- "learning_rate": 0.0,
837
- "loss": 0.07,
838
- "step": 940
839
- },
840
- {
841
- "epoch": 20.0,
842
- "eval_accuracy": 1.0,
843
- "eval_loss": 0.001178326434455812,
844
- "eval_runtime": 8.4555,
845
- "eval_samples_per_second": 75.217,
846
- "eval_steps_per_second": 2.365,
847
- "step": 940
848
- },
849
- {
850
- "epoch": 20.0,
851
- "step": 940,
852
- "total_flos": 6.302667737382912e+17,
853
- "train_loss": 0.13929352825309368,
854
- "train_runtime": 703.6937,
855
- "train_samples_per_second": 42.177,
856
- "train_steps_per_second": 1.336
857
  }
858
  ],
859
  "logging_steps": 10,
860
- "max_steps": 940,
861
  "num_input_tokens_seen": 0,
862
- "num_train_epochs": 20,
863
  "save_steps": 500,
864
- "total_flos": 6.302667737382912e+17,
865
  "train_batch_size": 32,
866
  "trial_name": null,
867
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.4889937106918239,
3
+ "best_model_checkpoint": "resnet-50-finetuned-student_kaggle/checkpoint-47",
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 235,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2127659574468085,
13
+ "grad_norm": NaN,
14
+ "learning_rate": 2.0833333333333336e-05,
15
+ "loss": 0.0,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.425531914893617,
20
+ "grad_norm": NaN,
21
+ "learning_rate": 4.166666666666667e-05,
22
+ "loss": 0.0,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.6382978723404256,
27
+ "grad_norm": NaN,
28
+ "learning_rate": 4.857819905213271e-05,
29
+ "loss": 0.0,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.851063829787234,
34
+ "grad_norm": NaN,
35
+ "learning_rate": 4.620853080568721e-05,
36
+ "loss": 0.0,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 1.0,
41
+ "eval_accuracy": 0.4889937106918239,
42
+ "eval_loss": NaN,
43
+ "eval_runtime": 8.1507,
44
+ "eval_samples_per_second": 78.03,
45
+ "eval_steps_per_second": 2.454,
46
  "step": 47
47
  },
48
  {
49
  "epoch": 1.0638297872340425,
50
+ "grad_norm": NaN,
51
+ "learning_rate": 4.383886255924171e-05,
52
+ "loss": 0.0,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 1.2765957446808511,
57
+ "grad_norm": NaN,
58
+ "learning_rate": 4.146919431279621e-05,
59
+ "loss": 0.0,
60
  "step": 60
61
  },
62
  {
63
  "epoch": 1.4893617021276595,
64
+ "grad_norm": NaN,
65
+ "learning_rate": 3.909952606635071e-05,
66
+ "loss": 0.0,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 1.702127659574468,
71
+ "grad_norm": NaN,
72
+ "learning_rate": 3.672985781990522e-05,
73
+ "loss": 0.0,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 1.9148936170212765,
78
+ "grad_norm": NaN,
79
+ "learning_rate": 3.4360189573459716e-05,
80
+ "loss": 0.0,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 2.0,
85
+ "eval_accuracy": 0.4889937106918239,
86
+ "eval_loss": NaN,
87
+ "eval_runtime": 8.3873,
88
+ "eval_samples_per_second": 75.829,
89
+ "eval_steps_per_second": 2.385,
90
  "step": 94
91
  },
92
  {
93
  "epoch": 2.127659574468085,
94
+ "grad_norm": NaN,
95
+ "learning_rate": 3.1990521327014215e-05,
96
+ "loss": 0.0,
97
  "step": 100
98
  },
99
  {
100
  "epoch": 2.3404255319148937,
101
+ "grad_norm": NaN,
102
+ "learning_rate": 2.962085308056872e-05,
103
+ "loss": 0.0,
104
  "step": 110
105
  },
106
  {
107
  "epoch": 2.5531914893617023,
108
+ "grad_norm": NaN,
109
+ "learning_rate": 2.7251184834123224e-05,
110
+ "loss": 0.0,
111
  "step": 120
112
  },
113
  {
114
  "epoch": 2.7659574468085104,
115
+ "grad_norm": NaN,
116
+ "learning_rate": 2.4881516587677726e-05,
117
+ "loss": 0.0,
118
  "step": 130
119
  },
120
  {
121
  "epoch": 2.978723404255319,
122
+ "grad_norm": NaN,
123
+ "learning_rate": 2.251184834123223e-05,
124
+ "loss": 0.0,
125
  "step": 140
126
  },
127
  {
128
  "epoch": 3.0,
129
+ "eval_accuracy": 0.4889937106918239,
130
+ "eval_loss": NaN,
131
+ "eval_runtime": 7.8854,
132
+ "eval_samples_per_second": 80.655,
133
+ "eval_steps_per_second": 2.536,
134
  "step": 141
135
  },
136
  {
137
  "epoch": 3.1914893617021276,
138
+ "grad_norm": NaN,
139
+ "learning_rate": 2.014218009478673e-05,
140
+ "loss": 0.0,
141
  "step": 150
142
  },
143
  {
144
  "epoch": 3.404255319148936,
145
+ "grad_norm": NaN,
146
+ "learning_rate": 1.7772511848341233e-05,
147
+ "loss": 0.0,
148
  "step": 160
149
  },
150
  {
151
  "epoch": 3.617021276595745,
152
+ "grad_norm": NaN,
153
+ "learning_rate": 1.5402843601895736e-05,
154
+ "loss": 0.0,
155
  "step": 170
156
  },
157
  {
158
  "epoch": 3.829787234042553,
159
+ "grad_norm": NaN,
160
+ "learning_rate": 1.3033175355450238e-05,
161
+ "loss": 0.0,
162
  "step": 180
163
  },
164
  {
165
  "epoch": 4.0,
166
+ "eval_accuracy": 0.4889937106918239,
167
+ "eval_loss": NaN,
168
+ "eval_runtime": 8.0232,
169
+ "eval_samples_per_second": 79.27,
170
+ "eval_steps_per_second": 2.493,
171
  "step": 188
172
  },
173
  {
174
  "epoch": 4.042553191489362,
175
+ "grad_norm": NaN,
176
+ "learning_rate": 1.066350710900474e-05,
177
+ "loss": 0.0,
178
  "step": 190
179
  },
180
  {
181
  "epoch": 4.25531914893617,
182
+ "grad_norm": NaN,
183
+ "learning_rate": 8.293838862559241e-06,
184
+ "loss": 0.0,
185
  "step": 200
186
  },
187
  {
188
  "epoch": 4.468085106382979,
189
+ "grad_norm": NaN,
190
+ "learning_rate": 5.924170616113745e-06,
191
+ "loss": 0.0,
192
  "step": 210
193
  },
194
  {
195
  "epoch": 4.680851063829787,
196
+ "grad_norm": NaN,
197
+ "learning_rate": 3.5545023696682464e-06,
198
+ "loss": 0.0,
199
  "step": 220
200
  },
201
  {
202
  "epoch": 4.8936170212765955,
203
+ "grad_norm": NaN,
204
+ "learning_rate": 1.1848341232227488e-06,
205
+ "loss": 0.0,
206
  "step": 230
207
  },
208
  {
209
  "epoch": 5.0,
210
+ "eval_accuracy": 0.4889937106918239,
211
+ "eval_loss": NaN,
212
+ "eval_runtime": 8.3163,
213
+ "eval_samples_per_second": 76.476,
214
+ "eval_steps_per_second": 2.405,
215
  "step": 235
216
  },
217
  {
218
+ "epoch": 5.0,
219
+ "step": 235,
220
+ "total_flos": 1.575666934345728e+17,
221
+ "train_loss": 0.0,
222
+ "train_runtime": 166.8151,
223
+ "train_samples_per_second": 44.48,
224
+ "train_steps_per_second": 1.409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  }
226
  ],
227
  "logging_steps": 10,
228
+ "max_steps": 235,
229
  "num_input_tokens_seen": 0,
230
+ "num_train_epochs": 5,
231
  "save_steps": 500,
232
+ "total_flos": 1.575666934345728e+17,
233
  "train_batch_size": 32,
234
  "trial_name": null,
235
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e86333774fdc2cb3ed0a9195f9238c2f5423847f1665b9f8b4b6f56508f38d3c
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8121ae27cdcd1b7b5c4adc8363f0af01cc99f2b977ff73e68a871ff6fbd7636
3
  size 5048