system-admin committed on
Commit
717c124
1 Parent(s): 9bdf0c8

Training in progress, epoch 1

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_accuracy": 0.87,
4
- "eval_loss": 0.40429016947746277,
5
- "eval_runtime": 2.5766,
6
- "eval_samples_per_second": 38.811,
7
- "eval_steps_per_second": 1.552,
8
  "total_flos": 1.98847911886848e+17,
9
- "train_loss": 0.30984322795501124,
10
- "train_runtime": 343.6308,
11
- "train_samples_per_second": 23.281,
12
- "train_steps_per_second": 0.757
13
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_accuracy": 0.84,
4
+ "eval_loss": 0.7047725915908813,
5
+ "eval_runtime": 3.1923,
6
+ "eval_samples_per_second": 31.325,
7
+ "eval_steps_per_second": 1.253,
8
  "total_flos": 1.98847911886848e+17,
9
+ "train_loss": 0.40210363498100865,
10
+ "train_runtime": 345.8785,
11
+ "train_samples_per_second": 23.13,
12
+ "train_steps_per_second": 0.752
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_accuracy": 0.87,
4
- "eval_loss": 0.40429016947746277,
5
- "eval_runtime": 2.5766,
6
- "eval_samples_per_second": 38.811,
7
- "eval_steps_per_second": 1.552
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_accuracy": 0.84,
4
+ "eval_loss": 0.7047725915908813,
5
+ "eval_runtime": 3.1923,
6
+ "eval_samples_per_second": 31.325,
7
+ "eval_steps_per_second": 1.253
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:327a1698bb22bcde61157dc1342240b6315f16a0435717d989809d3af5d8502b
3
  size 110342832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64c181a6685b40fca78c6b85bc77336c22dab4e2b1b275ffb0d9a459e4370c7a
3
  size 110342832
runs/May01_15-13-06_d6eb9620a945/events.out.tfevents.1714577258.d6eb9620a945.304.7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9b65cd5b81a475ab83d1aacd11f33c81f6346365c8da06f94013a957e6622d5
3
+ size 411
runs/May01_15-28-52_d6eb9620a945/events.out.tfevents.1714577382.d6eb9620a945.304.8 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d24b57d222e167895ef979aaf24e486ca766cb62e2b2d85a3858610f069741
3
+ size 5776
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
  "total_flos": 1.98847911886848e+17,
4
- "train_loss": 0.30984322795501124,
5
- "train_runtime": 343.6308,
6
- "train_samples_per_second": 23.281,
7
- "train_steps_per_second": 0.757
8
  }
 
1
  {
2
  "epoch": 20.0,
3
  "total_flos": 1.98847911886848e+17,
4
+ "train_loss": 0.40210363498100865,
5
+ "train_runtime": 345.8785,
6
+ "train_samples_per_second": 23.13,
7
+ "train_steps_per_second": 0.752
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.87,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-student_two_classes/checkpoint-117",
4
  "epoch": 20.0,
5
  "eval_steps": 500,
6
  "global_step": 260,
@@ -10,374 +10,374 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.7692307692307693,
13
- "grad_norm": 11.056398391723633,
14
  "learning_rate": 1.923076923076923e-05,
15
- "loss": 0.6951,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.82,
21
- "eval_loss": 0.4448259770870209,
22
- "eval_runtime": 3.0273,
23
- "eval_samples_per_second": 33.032,
24
- "eval_steps_per_second": 1.321,
25
  "step": 13
26
  },
27
  {
28
  "epoch": 1.5384615384615383,
29
- "grad_norm": 6.355745315551758,
30
  "learning_rate": 3.846153846153846e-05,
31
- "loss": 0.4292,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.82,
37
- "eval_loss": 0.44606465101242065,
38
- "eval_runtime": 2.4722,
39
- "eval_samples_per_second": 40.45,
40
- "eval_steps_per_second": 1.618,
41
  "step": 26
42
  },
43
  {
44
  "epoch": 2.3076923076923075,
45
- "grad_norm": 5.6881914138793945,
46
  "learning_rate": 4.9145299145299147e-05,
47
- "loss": 0.4246,
48
  "step": 30
49
  },
50
  {
51
  "epoch": 3.0,
52
  "eval_accuracy": 0.82,
53
- "eval_loss": 0.4553927481174469,
54
- "eval_runtime": 2.5808,
55
- "eval_samples_per_second": 38.748,
56
- "eval_steps_per_second": 1.55,
57
  "step": 39
58
  },
59
  {
60
  "epoch": 3.076923076923077,
61
- "grad_norm": 9.89670181274414,
62
  "learning_rate": 4.700854700854701e-05,
63
- "loss": 0.4424,
64
  "step": 40
65
  },
66
  {
67
  "epoch": 3.8461538461538463,
68
- "grad_norm": 6.457494258880615,
69
  "learning_rate": 4.4871794871794874e-05,
70
- "loss": 0.3983,
71
  "step": 50
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.83,
76
- "eval_loss": 0.4219551384449005,
77
- "eval_runtime": 2.9766,
78
- "eval_samples_per_second": 33.596,
79
- "eval_steps_per_second": 1.344,
80
  "step": 52
81
  },
82
  {
83
  "epoch": 4.615384615384615,
84
- "grad_norm": 5.4807610511779785,
85
  "learning_rate": 4.2735042735042735e-05,
86
- "loss": 0.314,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 5.0,
91
- "eval_accuracy": 0.83,
92
- "eval_loss": 0.44294700026512146,
93
- "eval_runtime": 2.417,
94
- "eval_samples_per_second": 41.374,
95
- "eval_steps_per_second": 1.655,
96
  "step": 65
97
  },
98
  {
99
  "epoch": 5.384615384615385,
100
- "grad_norm": 5.0344414710998535,
101
  "learning_rate": 4.05982905982906e-05,
102
- "loss": 0.4176,
103
  "step": 70
104
  },
105
  {
106
  "epoch": 6.0,
107
- "eval_accuracy": 0.82,
108
- "eval_loss": 0.4005734324455261,
109
- "eval_runtime": 3.0995,
110
- "eval_samples_per_second": 32.264,
111
- "eval_steps_per_second": 1.291,
112
  "step": 78
113
  },
114
  {
115
  "epoch": 6.153846153846154,
116
- "grad_norm": 6.132039546966553,
117
  "learning_rate": 3.846153846153846e-05,
118
- "loss": 0.3512,
119
  "step": 80
120
  },
121
  {
122
  "epoch": 6.923076923076923,
123
- "grad_norm": 3.423215389251709,
124
  "learning_rate": 3.6324786324786323e-05,
125
- "loss": 0.2862,
126
  "step": 90
127
  },
128
  {
129
  "epoch": 7.0,
130
- "eval_accuracy": 0.84,
131
- "eval_loss": 0.4145370423793793,
132
- "eval_runtime": 2.5336,
133
- "eval_samples_per_second": 39.47,
134
- "eval_steps_per_second": 1.579,
135
  "step": 91
136
  },
137
  {
138
  "epoch": 7.6923076923076925,
139
- "grad_norm": 4.900909423828125,
140
  "learning_rate": 3.418803418803419e-05,
141
- "loss": 0.3072,
142
  "step": 100
143
  },
144
  {
145
  "epoch": 8.0,
146
- "eval_accuracy": 0.83,
147
- "eval_loss": 0.38473400473594666,
148
- "eval_runtime": 2.5184,
149
- "eval_samples_per_second": 39.708,
150
- "eval_steps_per_second": 1.588,
151
  "step": 104
152
  },
153
  {
154
  "epoch": 8.461538461538462,
155
- "grad_norm": 6.61647891998291,
156
  "learning_rate": 3.205128205128206e-05,
157
- "loss": 0.3001,
158
  "step": 110
159
  },
160
  {
161
  "epoch": 9.0,
162
- "eval_accuracy": 0.87,
163
- "eval_loss": 0.40429016947746277,
164
- "eval_runtime": 3.3603,
165
- "eval_samples_per_second": 29.76,
166
- "eval_steps_per_second": 1.19,
167
  "step": 117
168
  },
169
  {
170
  "epoch": 9.23076923076923,
171
- "grad_norm": 9.093049049377441,
172
  "learning_rate": 2.9914529914529915e-05,
173
- "loss": 0.3225,
174
  "step": 120
175
  },
176
  {
177
  "epoch": 10.0,
178
- "grad_norm": 8.699834823608398,
179
  "learning_rate": 2.777777777777778e-05,
180
- "loss": 0.2937,
181
  "step": 130
182
  },
183
  {
184
  "epoch": 10.0,
185
- "eval_accuracy": 0.82,
186
- "eval_loss": 0.40262600779533386,
187
- "eval_runtime": 2.4958,
188
- "eval_samples_per_second": 40.067,
189
- "eval_steps_per_second": 1.603,
190
  "step": 130
191
  },
192
  {
193
  "epoch": 10.76923076923077,
194
- "grad_norm": 3.0456953048706055,
195
  "learning_rate": 2.564102564102564e-05,
196
- "loss": 0.2206,
197
  "step": 140
198
  },
199
  {
200
  "epoch": 11.0,
201
- "eval_accuracy": 0.83,
202
- "eval_loss": 0.3972433805465698,
203
- "eval_runtime": 2.6788,
204
- "eval_samples_per_second": 37.329,
205
- "eval_steps_per_second": 1.493,
206
  "step": 143
207
  },
208
  {
209
  "epoch": 11.538461538461538,
210
- "grad_norm": 6.638078689575195,
211
  "learning_rate": 2.3504273504273504e-05,
212
- "loss": 0.2287,
213
  "step": 150
214
  },
215
  {
216
  "epoch": 12.0,
217
- "eval_accuracy": 0.86,
218
- "eval_loss": 0.38398703932762146,
219
- "eval_runtime": 2.4563,
220
- "eval_samples_per_second": 40.711,
221
  "eval_steps_per_second": 1.628,
222
  "step": 156
223
  },
224
  {
225
  "epoch": 12.307692307692308,
226
- "grad_norm": 6.832516670227051,
227
  "learning_rate": 2.1367521367521368e-05,
228
- "loss": 0.3318,
229
  "step": 160
230
  },
231
  {
232
  "epoch": 13.0,
233
- "eval_accuracy": 0.84,
234
- "eval_loss": 0.3740682899951935,
235
- "eval_runtime": 2.6452,
236
- "eval_samples_per_second": 37.805,
237
- "eval_steps_per_second": 1.512,
238
  "step": 169
239
  },
240
  {
241
  "epoch": 13.076923076923077,
242
- "grad_norm": 6.917372226715088,
243
  "learning_rate": 1.923076923076923e-05,
244
- "loss": 0.2099,
245
  "step": 170
246
  },
247
  {
248
  "epoch": 13.846153846153847,
249
- "grad_norm": 6.500430107116699,
250
  "learning_rate": 1.7094017094017095e-05,
251
- "loss": 0.232,
252
  "step": 180
253
  },
254
  {
255
  "epoch": 14.0,
256
- "eval_accuracy": 0.85,
257
- "eval_loss": 0.38503700494766235,
258
- "eval_runtime": 2.5332,
259
- "eval_samples_per_second": 39.476,
260
- "eval_steps_per_second": 1.579,
261
  "step": 182
262
  },
263
  {
264
  "epoch": 14.615384615384615,
265
- "grad_norm": 7.342576503753662,
266
  "learning_rate": 1.4957264957264958e-05,
267
- "loss": 0.2277,
268
  "step": 190
269
  },
270
  {
271
  "epoch": 15.0,
272
- "eval_accuracy": 0.85,
273
- "eval_loss": 0.3989230692386627,
274
- "eval_runtime": 2.5166,
275
- "eval_samples_per_second": 39.737,
276
- "eval_steps_per_second": 1.589,
277
  "step": 195
278
  },
279
  {
280
  "epoch": 15.384615384615385,
281
- "grad_norm": 6.8460235595703125,
282
  "learning_rate": 1.282051282051282e-05,
283
- "loss": 0.2253,
284
  "step": 200
285
  },
286
  {
287
  "epoch": 16.0,
288
- "eval_accuracy": 0.85,
289
- "eval_loss": 0.4070873260498047,
290
- "eval_runtime": 3.0059,
291
- "eval_samples_per_second": 33.267,
292
- "eval_steps_per_second": 1.331,
293
  "step": 208
294
  },
295
  {
296
  "epoch": 16.153846153846153,
297
- "grad_norm": 3.827336311340332,
298
  "learning_rate": 1.0683760683760684e-05,
299
- "loss": 0.2202,
300
  "step": 210
301
  },
302
  {
303
  "epoch": 16.923076923076923,
304
- "grad_norm": 6.873669624328613,
305
  "learning_rate": 8.547008547008548e-06,
306
- "loss": 0.2463,
307
  "step": 220
308
  },
309
  {
310
  "epoch": 17.0,
311
- "eval_accuracy": 0.85,
312
- "eval_loss": 0.40268489718437195,
313
- "eval_runtime": 2.5105,
314
- "eval_samples_per_second": 39.833,
315
- "eval_steps_per_second": 1.593,
316
  "step": 221
317
  },
318
  {
319
  "epoch": 17.692307692307693,
320
- "grad_norm": 8.530336380004883,
321
  "learning_rate": 6.41025641025641e-06,
322
- "loss": 0.2496,
323
  "step": 230
324
  },
325
  {
326
  "epoch": 18.0,
327
- "eval_accuracy": 0.83,
328
- "eval_loss": 0.41463416814804077,
329
- "eval_runtime": 2.5445,
330
- "eval_samples_per_second": 39.301,
331
- "eval_steps_per_second": 1.572,
332
  "step": 234
333
  },
334
  {
335
  "epoch": 18.46153846153846,
336
- "grad_norm": 3.6479477882385254,
337
  "learning_rate": 4.273504273504274e-06,
338
- "loss": 0.1969,
339
  "step": 240
340
  },
341
  {
342
  "epoch": 19.0,
343
- "eval_accuracy": 0.83,
344
- "eval_loss": 0.41043660044670105,
345
- "eval_runtime": 3.1127,
346
- "eval_samples_per_second": 32.127,
347
- "eval_steps_per_second": 1.285,
348
  "step": 247
349
  },
350
  {
351
  "epoch": 19.23076923076923,
352
- "grad_norm": 5.254897117614746,
353
  "learning_rate": 2.136752136752137e-06,
354
- "loss": 0.257,
355
  "step": 250
356
  },
357
  {
358
  "epoch": 20.0,
359
- "grad_norm": 8.73101806640625,
360
  "learning_rate": 0.0,
361
- "loss": 0.2279,
362
  "step": 260
363
  },
364
  {
365
  "epoch": 20.0,
366
- "eval_accuracy": 0.82,
367
- "eval_loss": 0.41871729493141174,
368
- "eval_runtime": 2.516,
369
- "eval_samples_per_second": 39.746,
370
- "eval_steps_per_second": 1.59,
371
  "step": 260
372
  },
373
  {
374
  "epoch": 20.0,
375
  "step": 260,
376
  "total_flos": 1.98847911886848e+17,
377
- "train_loss": 0.30984322795501124,
378
- "train_runtime": 343.6308,
379
- "train_samples_per_second": 23.281,
380
- "train_steps_per_second": 0.757
381
  }
382
  ],
383
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.84,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-student_two_classes/checkpoint-104",
4
  "epoch": 20.0,
5
  "eval_steps": 500,
6
  "global_step": 260,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.7692307692307693,
13
+ "grad_norm": 7.759917736053467,
14
  "learning_rate": 1.923076923076923e-05,
15
+ "loss": 0.3845,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7,
21
+ "eval_loss": 0.6474852561950684,
22
+ "eval_runtime": 2.509,
23
+ "eval_samples_per_second": 39.856,
24
+ "eval_steps_per_second": 1.594,
25
  "step": 13
26
  },
27
  {
28
  "epoch": 1.5384615384615383,
29
+ "grad_norm": 22.67352294921875,
30
  "learning_rate": 3.846153846153846e-05,
31
+ "loss": 0.3466,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.74,
37
+ "eval_loss": 0.6201274991035461,
38
+ "eval_runtime": 2.4483,
39
+ "eval_samples_per_second": 40.845,
40
+ "eval_steps_per_second": 1.634,
41
  "step": 26
42
  },
43
  {
44
  "epoch": 2.3076923076923075,
45
+ "grad_norm": 68.59841918945312,
46
  "learning_rate": 4.9145299145299147e-05,
47
+ "loss": 0.3832,
48
  "step": 30
49
  },
50
  {
51
  "epoch": 3.0,
52
  "eval_accuracy": 0.82,
53
+ "eval_loss": 0.8067967295646667,
54
+ "eval_runtime": 2.4397,
55
+ "eval_samples_per_second": 40.989,
56
+ "eval_steps_per_second": 1.64,
57
  "step": 39
58
  },
59
  {
60
  "epoch": 3.076923076923077,
61
+ "grad_norm": 16.920194625854492,
62
  "learning_rate": 4.700854700854701e-05,
63
+ "loss": 0.4694,
64
  "step": 40
65
  },
66
  {
67
  "epoch": 3.8461538461538463,
68
+ "grad_norm": 59.7691764831543,
69
  "learning_rate": 4.4871794871794874e-05,
70
+ "loss": 0.5344,
71
  "step": 50
72
  },
73
  {
74
  "epoch": 4.0,
75
+ "eval_accuracy": 0.81,
76
+ "eval_loss": 0.6339858770370483,
77
+ "eval_runtime": 2.4546,
78
+ "eval_samples_per_second": 40.74,
79
+ "eval_steps_per_second": 1.63,
80
  "step": 52
81
  },
82
  {
83
  "epoch": 4.615384615384615,
84
+ "grad_norm": 21.54722785949707,
85
  "learning_rate": 4.2735042735042735e-05,
86
+ "loss": 0.4912,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 5.0,
91
+ "eval_accuracy": 0.8,
92
+ "eval_loss": 0.6879615187644958,
93
+ "eval_runtime": 2.8473,
94
+ "eval_samples_per_second": 35.121,
95
+ "eval_steps_per_second": 1.405,
96
  "step": 65
97
  },
98
  {
99
  "epoch": 5.384615384615385,
100
+ "grad_norm": 11.457103729248047,
101
  "learning_rate": 4.05982905982906e-05,
102
+ "loss": 0.5093,
103
  "step": 70
104
  },
105
  {
106
  "epoch": 6.0,
107
+ "eval_accuracy": 0.73,
108
+ "eval_loss": 0.699913501739502,
109
+ "eval_runtime": 2.4666,
110
+ "eval_samples_per_second": 40.541,
111
+ "eval_steps_per_second": 1.622,
112
  "step": 78
113
  },
114
  {
115
  "epoch": 6.153846153846154,
116
+ "grad_norm": 15.323335647583008,
117
  "learning_rate": 3.846153846153846e-05,
118
+ "loss": 0.5284,
119
  "step": 80
120
  },
121
  {
122
  "epoch": 6.923076923076923,
123
+ "grad_norm": 7.947178840637207,
124
  "learning_rate": 3.6324786324786323e-05,
125
+ "loss": 0.4109,
126
  "step": 90
127
  },
128
  {
129
  "epoch": 7.0,
130
+ "eval_accuracy": 0.83,
131
+ "eval_loss": 0.7294943928718567,
132
+ "eval_runtime": 2.4607,
133
+ "eval_samples_per_second": 40.638,
134
+ "eval_steps_per_second": 1.626,
135
  "step": 91
136
  },
137
  {
138
  "epoch": 7.6923076923076925,
139
+ "grad_norm": 9.474274635314941,
140
  "learning_rate": 3.418803418803419e-05,
141
+ "loss": 0.4383,
142
  "step": 100
143
  },
144
  {
145
  "epoch": 8.0,
146
+ "eval_accuracy": 0.84,
147
+ "eval_loss": 0.7047725915908813,
148
+ "eval_runtime": 2.8894,
149
+ "eval_samples_per_second": 34.609,
150
+ "eval_steps_per_second": 1.384,
151
  "step": 104
152
  },
153
  {
154
  "epoch": 8.461538461538462,
155
+ "grad_norm": 7.170910835266113,
156
  "learning_rate": 3.205128205128206e-05,
157
+ "loss": 0.4534,
158
  "step": 110
159
  },
160
  {
161
  "epoch": 9.0,
162
+ "eval_accuracy": 0.82,
163
+ "eval_loss": 0.609440803527832,
164
+ "eval_runtime": 2.4268,
165
+ "eval_samples_per_second": 41.206,
166
+ "eval_steps_per_second": 1.648,
167
  "step": 117
168
  },
169
  {
170
  "epoch": 9.23076923076923,
171
+ "grad_norm": 13.206290245056152,
172
  "learning_rate": 2.9914529914529915e-05,
173
+ "loss": 0.4504,
174
  "step": 120
175
  },
176
  {
177
  "epoch": 10.0,
178
+ "grad_norm": 15.387145042419434,
179
  "learning_rate": 2.777777777777778e-05,
180
+ "loss": 0.4684,
181
  "step": 130
182
  },
183
  {
184
  "epoch": 10.0,
185
+ "eval_accuracy": 0.74,
186
+ "eval_loss": 0.5788578987121582,
187
+ "eval_runtime": 2.5101,
188
+ "eval_samples_per_second": 39.839,
189
+ "eval_steps_per_second": 1.594,
190
  "step": 130
191
  },
192
  {
193
  "epoch": 10.76923076923077,
194
+ "grad_norm": 8.540888786315918,
195
  "learning_rate": 2.564102564102564e-05,
196
+ "loss": 0.3442,
197
  "step": 140
198
  },
199
  {
200
  "epoch": 11.0,
201
+ "eval_accuracy": 0.82,
202
+ "eval_loss": 0.7296608090400696,
203
+ "eval_runtime": 3.5919,
204
+ "eval_samples_per_second": 27.84,
205
+ "eval_steps_per_second": 1.114,
206
  "step": 143
207
  },
208
  {
209
  "epoch": 11.538461538461538,
210
+ "grad_norm": 7.705536842346191,
211
  "learning_rate": 2.3504273504273504e-05,
212
+ "loss": 0.3236,
213
  "step": 150
214
  },
215
  {
216
  "epoch": 12.0,
217
+ "eval_accuracy": 0.79,
218
+ "eval_loss": 0.7688478231430054,
219
+ "eval_runtime": 2.457,
220
+ "eval_samples_per_second": 40.7,
221
  "eval_steps_per_second": 1.628,
222
  "step": 156
223
  },
224
  {
225
  "epoch": 12.307692307692308,
226
+ "grad_norm": 4.703495502471924,
227
  "learning_rate": 2.1367521367521368e-05,
228
+ "loss": 0.4645,
229
  "step": 160
230
  },
231
  {
232
  "epoch": 13.0,
233
+ "eval_accuracy": 0.76,
234
+ "eval_loss": 0.6686670780181885,
235
+ "eval_runtime": 2.6242,
236
+ "eval_samples_per_second": 38.107,
237
+ "eval_steps_per_second": 1.524,
238
  "step": 169
239
  },
240
  {
241
  "epoch": 13.076923076923077,
242
+ "grad_norm": 12.792634010314941,
243
  "learning_rate": 1.923076923076923e-05,
244
+ "loss": 0.3457,
245
  "step": 170
246
  },
247
  {
248
  "epoch": 13.846153846153847,
249
+ "grad_norm": 8.466885566711426,
250
  "learning_rate": 1.7094017094017095e-05,
251
+ "loss": 0.3532,
252
  "step": 180
253
  },
254
  {
255
  "epoch": 14.0,
256
+ "eval_accuracy": 0.84,
257
+ "eval_loss": 0.787961483001709,
258
+ "eval_runtime": 2.4969,
259
+ "eval_samples_per_second": 40.05,
260
+ "eval_steps_per_second": 1.602,
261
  "step": 182
262
  },
263
  {
264
  "epoch": 14.615384615384615,
265
+ "grad_norm": 13.883042335510254,
266
  "learning_rate": 1.4957264957264958e-05,
267
+ "loss": 0.3394,
268
  "step": 190
269
  },
270
  {
271
  "epoch": 15.0,
272
+ "eval_accuracy": 0.79,
273
+ "eval_loss": 0.7216033935546875,
274
+ "eval_runtime": 2.5404,
275
+ "eval_samples_per_second": 39.364,
276
+ "eval_steps_per_second": 1.575,
277
  "step": 195
278
  },
279
  {
280
  "epoch": 15.384615384615385,
281
+ "grad_norm": 8.006115913391113,
282
  "learning_rate": 1.282051282051282e-05,
283
+ "loss": 0.3311,
284
  "step": 200
285
  },
286
  {
287
  "epoch": 16.0,
288
+ "eval_accuracy": 0.79,
289
+ "eval_loss": 0.7209141254425049,
290
+ "eval_runtime": 4.7912,
291
+ "eval_samples_per_second": 20.872,
292
+ "eval_steps_per_second": 0.835,
293
  "step": 208
294
  },
295
  {
296
  "epoch": 16.153846153846153,
297
+ "grad_norm": 5.578493118286133,
298
  "learning_rate": 1.0683760683760684e-05,
299
+ "loss": 0.3509,
300
  "step": 210
301
  },
302
  {
303
  "epoch": 16.923076923076923,
304
+ "grad_norm": 6.166889190673828,
305
  "learning_rate": 8.547008547008548e-06,
306
+ "loss": 0.3367,
307
  "step": 220
308
  },
309
  {
310
  "epoch": 17.0,
311
+ "eval_accuracy": 0.71,
312
+ "eval_loss": 0.6826711297035217,
313
+ "eval_runtime": 2.5244,
314
+ "eval_samples_per_second": 39.614,
315
+ "eval_steps_per_second": 1.585,
316
  "step": 221
317
  },
318
  {
319
  "epoch": 17.692307692307693,
320
+ "grad_norm": 10.269214630126953,
321
  "learning_rate": 6.41025641025641e-06,
322
+ "loss": 0.3673,
323
  "step": 230
324
  },
325
  {
326
  "epoch": 18.0,
327
+ "eval_accuracy": 0.76,
328
+ "eval_loss": 0.7472490072250366,
329
+ "eval_runtime": 3.2931,
330
+ "eval_samples_per_second": 30.367,
331
+ "eval_steps_per_second": 1.215,
332
  "step": 234
333
  },
334
  {
335
  "epoch": 18.46153846153846,
336
+ "grad_norm": 7.079315662384033,
337
  "learning_rate": 4.273504273504274e-06,
338
+ "loss": 0.3024,
339
  "step": 240
340
  },
341
  {
342
  "epoch": 19.0,
343
+ "eval_accuracy": 0.79,
344
+ "eval_loss": 0.7760630249977112,
345
+ "eval_runtime": 2.5705,
346
+ "eval_samples_per_second": 38.903,
347
+ "eval_steps_per_second": 1.556,
348
  "step": 247
349
  },
350
  {
351
  "epoch": 19.23076923076923,
352
+ "grad_norm": 7.1634039878845215,
353
  "learning_rate": 2.136752136752137e-06,
354
+ "loss": 0.3652,
355
  "step": 250
356
  },
357
  {
358
  "epoch": 20.0,
359
+ "grad_norm": 28.70159912109375,
360
  "learning_rate": 0.0,
361
+ "loss": 0.3624,
362
  "step": 260
363
  },
364
  {
365
  "epoch": 20.0,
366
+ "eval_accuracy": 0.76,
367
+ "eval_loss": 0.7436763048171997,
368
+ "eval_runtime": 2.484,
369
+ "eval_samples_per_second": 40.257,
370
+ "eval_steps_per_second": 1.61,
371
  "step": 260
372
  },
373
  {
374
  "epoch": 20.0,
375
  "step": 260,
376
  "total_flos": 1.98847911886848e+17,
377
+ "train_loss": 0.40210363498100865,
378
+ "train_runtime": 345.8785,
379
+ "train_samples_per_second": 23.13,
380
+ "train_steps_per_second": 0.752
381
  }
382
  ],
383
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e52f74d254afa65d0442792843c29612d3340e5c4e9e5f971ddaf13443f39957
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1a1315b4fab4894e88f79f64a4fee5ad9165a6ea5806aaa70fbbdcf4c474250
3
  size 5112