wahidww commited on
Commit
a2eca82
1 Parent(s): 0d55b17

Training in progress, epoch 1

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 26.666666666666668,
3
- "eval_accuracy": 0.8709677419354839,
4
- "eval_loss": 0.39681780338287354,
5
- "eval_runtime": 5.4761,
6
- "eval_samples_per_second": 5.661,
7
- "eval_steps_per_second": 0.183,
8
- "total_flos": 3.811843351809884e+17,
9
- "train_loss": 0.6624618768692017,
10
- "train_runtime": 3041.8152,
11
- "train_samples_per_second": 5.671,
12
- "train_steps_per_second": 0.02
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.8257839721254355,
4
+ "eval_loss": 0.4613528847694397,
5
+ "eval_runtime": 5.98,
6
+ "eval_samples_per_second": 143.98,
7
+ "eval_steps_per_second": 2.341,
8
+ "total_flos": 4.064614037073838e+18,
9
+ "train_loss": 0.8882290616631507,
10
+ "train_runtime": 1944.8842,
11
+ "train_samples_per_second": 84.062,
12
+ "train_steps_per_second": 0.329
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 26.666666666666668,
3
- "eval_accuracy": 0.8709677419354839,
4
- "eval_loss": 0.39681780338287354,
5
- "eval_runtime": 5.4761,
6
- "eval_samples_per_second": 5.661,
7
- "eval_steps_per_second": 0.183
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.8257839721254355,
4
+ "eval_loss": 0.4613528847694397,
5
+ "eval_runtime": 5.98,
6
+ "eval_samples_per_second": 143.98,
7
+ "eval_steps_per_second": 2.341
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40e356463ace1b530cc5958dab5e98777750902d65fc59d145a092749fab1bca
3
  size 110367448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9816ed93b4eec04bf23e2baa39a074364eb3de0c2e6133c698f1a52544d9b6f7
3
  size 110367448
runs/Nov18_12-36-05_fac6bf2076b2/events.out.tfevents.1731936395.fac6bf2076b2.31.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654bcb90f6650830fd65aa4a2217c94bbb71c57e2dd1c5d3c90a646e94a1dd88
3
+ size 411
runs/Nov18_13-26-46_fac6bf2076b2/events.out.tfevents.1731936413.fac6bf2076b2.31.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4915d5ee1f7f08b36ecd656375c4e53252763f404642e3c3cb96d9406f3ee4b4
3
+ size 7157
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 26.666666666666668,
3
- "total_flos": 3.811843351809884e+17,
4
- "train_loss": 0.6624618768692017,
5
- "train_runtime": 3041.8152,
6
- "train_samples_per_second": 5.671,
7
- "train_steps_per_second": 0.02
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 4.064614037073838e+18,
4
+ "train_loss": 0.8882290616631507,
5
+ "train_runtime": 1944.8842,
6
+ "train_samples_per_second": 84.062,
7
+ "train_steps_per_second": 0.329
8
  }
trainer_state.json CHANGED
@@ -1,298 +1,411 @@
1
  {
2
- "best_metric": 0.8709677419354839,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-mobile-eye-tracking-dataset-v2/checkpoint-45",
4
- "epoch": 26.666666666666668,
5
  "eval_steps": 500,
6
- "global_step": 60,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.8888888888888888,
13
- "eval_accuracy": 0.22580645161290322,
14
- "eval_loss": 1.775622844696045,
15
- "eval_runtime": 7.2696,
16
- "eval_samples_per_second": 4.264,
17
- "eval_steps_per_second": 0.138,
18
- "step": 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  },
20
  {
21
- "epoch": 1.7777777777777777,
22
- "eval_accuracy": 0.25806451612903225,
23
- "eval_loss": 1.6783900260925293,
24
- "eval_runtime": 5.963,
25
- "eval_samples_per_second": 5.199,
26
- "eval_steps_per_second": 0.168,
27
- "step": 4
28
  },
29
  {
30
- "epoch": 2.6666666666666665,
31
- "eval_accuracy": 0.3225806451612903,
32
- "eval_loss": 1.5861129760742188,
33
- "eval_runtime": 5.4576,
34
- "eval_samples_per_second": 5.68,
35
- "eval_steps_per_second": 0.183,
36
- "step": 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  },
38
  {
39
  "epoch": 4.0,
40
- "eval_accuracy": 0.41935483870967744,
41
- "eval_loss": 1.3571434020996094,
42
- "eval_runtime": 5.4352,
43
- "eval_samples_per_second": 5.704,
44
- "eval_steps_per_second": 0.184,
45
- "step": 9
46
- },
47
- {
48
- "epoch": 4.888888888888889,
49
- "eval_accuracy": 0.5483870967741935,
50
- "eval_loss": 1.099271535873413,
51
- "eval_runtime": 5.4311,
52
- "eval_samples_per_second": 5.708,
53
- "eval_steps_per_second": 0.184,
54
- "step": 11
55
- },
56
- {
57
- "epoch": 5.777777777777778,
58
- "eval_accuracy": 0.6451612903225806,
59
- "eval_loss": 0.9241538047790527,
60
- "eval_runtime": 5.4175,
61
- "eval_samples_per_second": 5.722,
62
- "eval_steps_per_second": 0.185,
63
- "step": 13
64
- },
65
- {
66
- "epoch": 6.666666666666667,
67
- "grad_norm": 4.35577917098999,
68
- "learning_rate": 4.166666666666667e-05,
69
- "loss": 1.4667,
70
- "step": 15
71
  },
72
  {
73
- "epoch": 6.666666666666667,
74
- "eval_accuracy": 0.7096774193548387,
75
- "eval_loss": 0.7538339495658875,
76
- "eval_runtime": 5.4027,
77
- "eval_samples_per_second": 5.738,
78
- "eval_steps_per_second": 0.185,
79
- "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 8.0,
83
- "eval_accuracy": 0.7741935483870968,
84
- "eval_loss": 0.6294359564781189,
85
- "eval_runtime": 5.4179,
86
- "eval_samples_per_second": 5.722,
87
- "eval_steps_per_second": 0.185,
88
- "step": 18
89
- },
90
- {
91
- "epoch": 8.88888888888889,
92
- "eval_accuracy": 0.7096774193548387,
93
- "eval_loss": 0.5325695872306824,
94
- "eval_runtime": 5.4264,
95
- "eval_samples_per_second": 5.713,
96
- "eval_steps_per_second": 0.184,
97
- "step": 20
98
- },
99
- {
100
- "epoch": 9.777777777777779,
101
- "eval_accuracy": 0.7419354838709677,
102
- "eval_loss": 0.48480212688446045,
103
- "eval_runtime": 5.432,
104
- "eval_samples_per_second": 5.707,
105
- "eval_steps_per_second": 0.184,
106
- "step": 22
107
- },
108
- {
109
- "epoch": 10.666666666666666,
110
- "eval_accuracy": 0.7741935483870968,
111
- "eval_loss": 0.4832201302051544,
112
- "eval_runtime": 5.482,
113
- "eval_samples_per_second": 5.655,
114
- "eval_steps_per_second": 0.182,
115
- "step": 24
116
- },
117
- {
118
- "epoch": 12.0,
119
- "eval_accuracy": 0.7741935483870968,
120
- "eval_loss": 0.44829437136650085,
121
- "eval_runtime": 5.3703,
122
- "eval_samples_per_second": 5.772,
123
- "eval_steps_per_second": 0.186,
124
- "step": 27
125
- },
126
- {
127
- "epoch": 12.88888888888889,
128
- "eval_accuracy": 0.7741935483870968,
129
- "eval_loss": 0.4296128451824188,
130
- "eval_runtime": 5.4565,
131
- "eval_samples_per_second": 5.681,
132
- "eval_steps_per_second": 0.183,
133
- "step": 29
134
- },
135
- {
136
- "epoch": 13.333333333333334,
137
- "grad_norm": 5.256907939910889,
138
- "learning_rate": 2.777777777777778e-05,
139
- "loss": 0.5925,
140
- "step": 30
141
  },
142
  {
143
- "epoch": 13.777777777777779,
144
- "eval_accuracy": 0.7741935483870968,
145
- "eval_loss": 0.40228280425071716,
146
- "eval_runtime": 5.3947,
147
- "eval_samples_per_second": 5.746,
148
- "eval_steps_per_second": 0.185,
149
- "step": 31
150
- },
151
- {
152
- "epoch": 14.666666666666666,
153
- "eval_accuracy": 0.8387096774193549,
154
- "eval_loss": 0.4110867977142334,
155
- "eval_runtime": 5.4946,
156
- "eval_samples_per_second": 5.642,
157
- "eval_steps_per_second": 0.182,
158
- "step": 33
159
- },
160
- {
161
- "epoch": 16.0,
162
- "eval_accuracy": 0.8064516129032258,
163
- "eval_loss": 0.387315571308136,
164
- "eval_runtime": 5.4222,
165
- "eval_samples_per_second": 5.717,
166
- "eval_steps_per_second": 0.184,
167
- "step": 36
168
- },
169
- {
170
- "epoch": 16.88888888888889,
171
- "eval_accuracy": 0.8064516129032258,
172
- "eval_loss": 0.4028545618057251,
173
- "eval_runtime": 5.4659,
174
- "eval_samples_per_second": 5.672,
175
- "eval_steps_per_second": 0.183,
176
- "step": 38
177
- },
178
- {
179
- "epoch": 17.77777777777778,
180
- "eval_accuracy": 0.8064516129032258,
181
- "eval_loss": 0.4065493047237396,
182
- "eval_runtime": 5.394,
183
- "eval_samples_per_second": 5.747,
184
- "eval_steps_per_second": 0.185,
185
- "step": 40
186
- },
187
- {
188
- "epoch": 18.666666666666668,
189
- "eval_accuracy": 0.8064516129032258,
190
- "eval_loss": 0.38641268014907837,
191
- "eval_runtime": 5.4499,
192
- "eval_samples_per_second": 5.688,
193
- "eval_steps_per_second": 0.183,
194
- "step": 42
195
- },
196
- {
197
- "epoch": 20.0,
198
- "grad_norm": 5.7712812423706055,
199
- "learning_rate": 1.388888888888889e-05,
200
- "loss": 0.3285,
201
- "step": 45
202
  },
203
  {
204
- "epoch": 20.0,
205
- "eval_accuracy": 0.8709677419354839,
206
- "eval_loss": 0.39681780338287354,
207
- "eval_runtime": 5.3981,
208
- "eval_samples_per_second": 5.743,
209
- "eval_steps_per_second": 0.185,
210
- "step": 45
211
  },
212
  {
213
- "epoch": 20.88888888888889,
214
- "eval_accuracy": 0.8709677419354839,
215
- "eval_loss": 0.3929939270019531,
216
- "eval_runtime": 5.3507,
217
- "eval_samples_per_second": 5.794,
218
- "eval_steps_per_second": 0.187,
219
- "step": 47
220
- },
221
- {
222
- "epoch": 21.77777777777778,
223
- "eval_accuracy": 0.8709677419354839,
224
- "eval_loss": 0.3871462643146515,
225
- "eval_runtime": 5.3376,
226
- "eval_samples_per_second": 5.808,
227
- "eval_steps_per_second": 0.187,
228
- "step": 49
229
- },
230
- {
231
- "epoch": 22.666666666666668,
232
- "eval_accuracy": 0.8064516129032258,
233
- "eval_loss": 0.3779211640357971,
234
- "eval_runtime": 5.365,
235
- "eval_samples_per_second": 5.778,
236
- "eval_steps_per_second": 0.186,
237
- "step": 51
238
- },
239
- {
240
- "epoch": 24.0,
241
- "eval_accuracy": 0.8064516129032258,
242
- "eval_loss": 0.36982351541519165,
243
- "eval_runtime": 5.3833,
244
- "eval_samples_per_second": 5.759,
245
- "eval_steps_per_second": 0.186,
246
- "step": 54
247
- },
248
- {
249
- "epoch": 24.88888888888889,
250
- "eval_accuracy": 0.8387096774193549,
251
- "eval_loss": 0.37262630462646484,
252
- "eval_runtime": 5.3782,
253
- "eval_samples_per_second": 5.764,
254
- "eval_steps_per_second": 0.186,
255
- "step": 56
256
- },
257
- {
258
- "epoch": 25.77777777777778,
259
- "eval_accuracy": 0.8387096774193549,
260
- "eval_loss": 0.3732232451438904,
261
- "eval_runtime": 5.4396,
262
- "eval_samples_per_second": 5.699,
263
- "eval_steps_per_second": 0.184,
264
- "step": 58
265
- },
266
- {
267
- "epoch": 26.666666666666668,
268
- "grad_norm": 3.0856940746307373,
269
- "learning_rate": 0.0,
270
- "loss": 0.2621,
271
- "step": 60
272
  },
273
  {
274
- "epoch": 26.666666666666668,
275
- "eval_accuracy": 0.8387096774193549,
276
- "eval_loss": 0.3731651306152344,
277
- "eval_runtime": 6.0663,
278
- "eval_samples_per_second": 5.11,
279
- "eval_steps_per_second": 0.165,
280
- "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  },
282
  {
283
- "epoch": 26.666666666666668,
284
- "step": 60,
285
- "total_flos": 3.811843351809884e+17,
286
- "train_loss": 0.6624618768692017,
287
- "train_runtime": 3041.8152,
288
- "train_samples_per_second": 5.671,
289
- "train_steps_per_second": 0.02
290
  }
291
  ],
292
  "logging_steps": 15,
293
- "max_steps": 60,
294
  "num_input_tokens_seen": 0,
295
- "num_train_epochs": 30,
296
  "save_steps": 500,
297
  "stateful_callbacks": {
298
  "TrainerControl": {
@@ -306,7 +419,7 @@
306
  "attributes": {}
307
  }
308
  },
309
- "total_flos": 3.811843351809884e+17,
310
  "train_batch_size": 64,
311
  "trial_name": null,
312
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8257839721254355,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-mobile-eye-tracking-dataset-v2/checkpoint-640",
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 640,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.234375,
13
+ "grad_norm": 5.875625133514404,
14
+ "learning_rate": 2.3437500000000002e-06,
15
+ "loss": 2.3858,
16
+ "step": 15
17
+ },
18
+ {
19
+ "epoch": 0.46875,
20
+ "grad_norm": 4.278784275054932,
21
+ "learning_rate": 4.6875000000000004e-06,
22
+ "loss": 2.2837,
23
+ "step": 30
24
+ },
25
+ {
26
+ "epoch": 0.703125,
27
+ "grad_norm": 4.786104679107666,
28
+ "learning_rate": 7.031250000000001e-06,
29
+ "loss": 2.1191,
30
+ "step": 45
31
+ },
32
+ {
33
+ "epoch": 0.9375,
34
+ "grad_norm": 4.474501132965088,
35
+ "learning_rate": 9.375000000000001e-06,
36
+ "loss": 1.8826,
37
+ "step": 60
38
+ },
39
+ {
40
+ "epoch": 1.0,
41
+ "eval_accuracy": 0.46689895470383275,
42
+ "eval_loss": 1.5673099756240845,
43
+ "eval_runtime": 9.0893,
44
+ "eval_samples_per_second": 94.727,
45
+ "eval_steps_per_second": 1.54,
46
+ "step": 64
47
+ },
48
+ {
49
+ "epoch": 1.171875,
50
+ "grad_norm": 4.6485514640808105,
51
+ "learning_rate": 9.80902777777778e-06,
52
+ "loss": 1.596,
53
+ "step": 75
54
+ },
55
+ {
56
+ "epoch": 1.40625,
57
+ "grad_norm": 5.915872573852539,
58
+ "learning_rate": 9.548611111111113e-06,
59
+ "loss": 1.3478,
60
+ "step": 90
61
+ },
62
+ {
63
+ "epoch": 1.640625,
64
+ "grad_norm": 11.166051864624023,
65
+ "learning_rate": 9.288194444444444e-06,
66
+ "loss": 1.1995,
67
+ "step": 105
68
+ },
69
+ {
70
+ "epoch": 1.875,
71
+ "grad_norm": 8.33702564239502,
72
+ "learning_rate": 9.027777777777779e-06,
73
+ "loss": 1.1123,
74
+ "step": 120
75
+ },
76
+ {
77
+ "epoch": 2.0,
78
+ "eval_accuracy": 0.7154471544715447,
79
+ "eval_loss": 0.9031155705451965,
80
+ "eval_runtime": 6.0626,
81
+ "eval_samples_per_second": 142.018,
82
+ "eval_steps_per_second": 2.309,
83
+ "step": 128
84
+ },
85
+ {
86
+ "epoch": 2.109375,
87
+ "grad_norm": 9.904069900512695,
88
+ "learning_rate": 8.767361111111112e-06,
89
+ "loss": 1.0379,
90
+ "step": 135
91
  },
92
  {
93
+ "epoch": 2.34375,
94
+ "grad_norm": 6.573493957519531,
95
+ "learning_rate": 8.506944444444445e-06,
96
+ "loss": 0.9758,
97
+ "step": 150
 
 
98
  },
99
  {
100
+ "epoch": 2.578125,
101
+ "grad_norm": 7.493569374084473,
102
+ "learning_rate": 8.246527777777779e-06,
103
+ "loss": 0.9458,
104
+ "step": 165
105
+ },
106
+ {
107
+ "epoch": 2.8125,
108
+ "grad_norm": 8.070711135864258,
109
+ "learning_rate": 7.986111111111112e-06,
110
+ "loss": 0.8883,
111
+ "step": 180
112
+ },
113
+ {
114
+ "epoch": 3.0,
115
+ "eval_accuracy": 0.7572590011614402,
116
+ "eval_loss": 0.7254553437232971,
117
+ "eval_runtime": 5.0,
118
+ "eval_samples_per_second": 172.198,
119
+ "eval_steps_per_second": 2.8,
120
+ "step": 192
121
+ },
122
+ {
123
+ "epoch": 3.046875,
124
+ "grad_norm": 6.530824661254883,
125
+ "learning_rate": 7.725694444444445e-06,
126
+ "loss": 0.8797,
127
+ "step": 195
128
+ },
129
+ {
130
+ "epoch": 3.28125,
131
+ "grad_norm": 4.865982532501221,
132
+ "learning_rate": 7.465277777777778e-06,
133
+ "loss": 0.8543,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 3.515625,
138
+ "grad_norm": 8.852520942687988,
139
+ "learning_rate": 7.204861111111112e-06,
140
+ "loss": 0.8026,
141
+ "step": 225
142
+ },
143
+ {
144
+ "epoch": 3.75,
145
+ "grad_norm": 10.359617233276367,
146
+ "learning_rate": 6.944444444444445e-06,
147
+ "loss": 0.7792,
148
+ "step": 240
149
+ },
150
+ {
151
+ "epoch": 3.984375,
152
+ "grad_norm": 8.156173706054688,
153
+ "learning_rate": 6.684027777777779e-06,
154
+ "loss": 0.7778,
155
+ "step": 255
156
  },
157
  {
158
  "epoch": 4.0,
159
+ "eval_accuracy": 0.7793263646922184,
160
+ "eval_loss": 0.621900737285614,
161
+ "eval_runtime": 4.9966,
162
+ "eval_samples_per_second": 172.316,
163
+ "eval_steps_per_second": 2.802,
164
+ "step": 256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  },
166
  {
167
+ "epoch": 4.21875,
168
+ "grad_norm": 7.673396587371826,
169
+ "learning_rate": 6.423611111111112e-06,
170
+ "loss": 0.7533,
171
+ "step": 270
172
+ },
173
+ {
174
+ "epoch": 4.453125,
175
+ "grad_norm": 7.9218645095825195,
176
+ "learning_rate": 6.163194444444444e-06,
177
+ "loss": 0.7165,
178
+ "step": 285
179
+ },
180
+ {
181
+ "epoch": 4.6875,
182
+ "grad_norm": 5.629858493804932,
183
+ "learning_rate": 5.9027777777777785e-06,
184
+ "loss": 0.7092,
185
+ "step": 300
186
+ },
187
+ {
188
+ "epoch": 4.921875,
189
+ "grad_norm": 9.611222267150879,
190
+ "learning_rate": 5.642361111111112e-06,
191
+ "loss": 0.708,
192
+ "step": 315
193
+ },
194
+ {
195
+ "epoch": 5.0,
196
+ "eval_accuracy": 0.8002322880371661,
197
+ "eval_loss": 0.5520657896995544,
198
+ "eval_runtime": 4.9527,
199
+ "eval_samples_per_second": 173.845,
200
+ "eval_steps_per_second": 2.827,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 5.15625,
205
+ "grad_norm": 8.07551383972168,
206
+ "learning_rate": 5.381944444444445e-06,
207
+ "loss": 0.6842,
208
+ "step": 330
209
+ },
210
+ {
211
+ "epoch": 5.390625,
212
+ "grad_norm": 7.263919830322266,
213
+ "learning_rate": 5.121527777777778e-06,
214
+ "loss": 0.6724,
215
+ "step": 345
216
+ },
217
+ {
218
+ "epoch": 5.625,
219
+ "grad_norm": 6.807614326477051,
220
+ "learning_rate": 4.861111111111111e-06,
221
+ "loss": 0.6703,
222
+ "step": 360
223
+ },
224
+ {
225
+ "epoch": 5.859375,
226
+ "grad_norm": 6.808135509490967,
227
+ "learning_rate": 4.6006944444444446e-06,
228
+ "loss": 0.6308,
229
+ "step": 375
230
+ },
231
+ {
232
+ "epoch": 6.0,
233
+ "eval_accuracy": 0.8130081300813008,
234
+ "eval_loss": 0.5193033218383789,
235
+ "eval_runtime": 5.1321,
236
+ "eval_samples_per_second": 167.766,
237
+ "eval_steps_per_second": 2.728,
238
+ "step": 384
239
+ },
240
+ {
241
+ "epoch": 6.09375,
242
+ "grad_norm": 6.225130081176758,
243
+ "learning_rate": 4.340277777777779e-06,
244
+ "loss": 0.6442,
245
+ "step": 390
246
+ },
247
+ {
248
+ "epoch": 6.328125,
249
+ "grad_norm": 9.285326957702637,
250
+ "learning_rate": 4.079861111111111e-06,
251
+ "loss": 0.6139,
252
+ "step": 405
253
+ },
254
+ {
255
+ "epoch": 6.5625,
256
+ "grad_norm": 7.4434356689453125,
257
+ "learning_rate": 3.819444444444444e-06,
258
+ "loss": 0.6164,
259
+ "step": 420
260
+ },
261
+ {
262
+ "epoch": 6.796875,
263
+ "grad_norm": 7.437946796417236,
264
+ "learning_rate": 3.5590277777777783e-06,
265
+ "loss": 0.6142,
266
+ "step": 435
267
+ },
268
+ {
269
+ "epoch": 7.0,
270
+ "eval_accuracy": 0.8234610917537747,
271
+ "eval_loss": 0.4853927195072174,
272
+ "eval_runtime": 5.3739,
273
+ "eval_samples_per_second": 160.219,
274
+ "eval_steps_per_second": 2.605,
275
+ "step": 448
276
+ },
277
+ {
278
+ "epoch": 7.03125,
279
+ "grad_norm": 8.036016464233398,
280
+ "learning_rate": 3.2986111111111115e-06,
281
+ "loss": 0.6178,
282
+ "step": 450
283
+ },
284
+ {
285
+ "epoch": 7.265625,
286
+ "grad_norm": 6.326539039611816,
287
+ "learning_rate": 3.0381944444444443e-06,
288
+ "loss": 0.6076,
289
+ "step": 465
290
+ },
291
+ {
292
+ "epoch": 7.5,
293
+ "grad_norm": 6.9872236251831055,
294
+ "learning_rate": 2.7777777777777783e-06,
295
+ "loss": 0.5688,
296
+ "step": 480
297
+ },
298
+ {
299
+ "epoch": 7.734375,
300
+ "grad_norm": 7.621065139770508,
301
+ "learning_rate": 2.517361111111111e-06,
302
+ "loss": 0.6034,
303
+ "step": 495
304
+ },
305
+ {
306
+ "epoch": 7.96875,
307
+ "grad_norm": 7.739367485046387,
308
+ "learning_rate": 2.2569444444444448e-06,
309
+ "loss": 0.5817,
310
+ "step": 510
311
  },
312
  {
313
  "epoch": 8.0,
314
+ "eval_accuracy": 0.8199767711962834,
315
+ "eval_loss": 0.472567081451416,
316
+ "eval_runtime": 5.1622,
317
+ "eval_samples_per_second": 166.788,
318
+ "eval_steps_per_second": 2.712,
319
+ "step": 512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  },
321
  {
322
+ "epoch": 8.203125,
323
+ "grad_norm": 7.276366710662842,
324
+ "learning_rate": 1.996527777777778e-06,
325
+ "loss": 0.5682,
326
+ "step": 525
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  },
328
  {
329
+ "epoch": 8.4375,
330
+ "grad_norm": 9.095114707946777,
331
+ "learning_rate": 1.7361111111111112e-06,
332
+ "loss": 0.5797,
333
+ "step": 540
 
 
334
  },
335
  {
336
+ "epoch": 8.671875,
337
+ "grad_norm": 7.369579792022705,
338
+ "learning_rate": 1.4756944444444446e-06,
339
+ "loss": 0.5732,
340
+ "step": 555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  },
342
  {
343
+ "epoch": 8.90625,
344
+ "grad_norm": 6.826248645782471,
345
+ "learning_rate": 1.2152777777777778e-06,
346
+ "loss": 0.5952,
347
+ "step": 570
348
+ },
349
+ {
350
+ "epoch": 9.0,
351
+ "eval_accuracy": 0.8211382113821138,
352
+ "eval_loss": 0.46478894352912903,
353
+ "eval_runtime": 5.0463,
354
+ "eval_samples_per_second": 170.621,
355
+ "eval_steps_per_second": 2.774,
356
+ "step": 576
357
+ },
358
+ {
359
+ "epoch": 9.140625,
360
+ "grad_norm": 12.532827377319336,
361
+ "learning_rate": 9.54861111111111e-07,
362
+ "loss": 0.5849,
363
+ "step": 585
364
+ },
365
+ {
366
+ "epoch": 9.375,
367
+ "grad_norm": 8.84656047821045,
368
+ "learning_rate": 6.944444444444446e-07,
369
+ "loss": 0.583,
370
+ "step": 600
371
+ },
372
+ {
373
+ "epoch": 9.609375,
374
+ "grad_norm": 9.143095970153809,
375
+ "learning_rate": 4.340277777777778e-07,
376
+ "loss": 0.5599,
377
+ "step": 615
378
+ },
379
+ {
380
+ "epoch": 9.84375,
381
+ "grad_norm": 7.61570930480957,
382
+ "learning_rate": 1.7361111111111115e-07,
383
+ "loss": 0.5915,
384
+ "step": 630
385
+ },
386
+ {
387
+ "epoch": 10.0,
388
+ "eval_accuracy": 0.8257839721254355,
389
+ "eval_loss": 0.4613528847694397,
390
+ "eval_runtime": 5.2149,
391
+ "eval_samples_per_second": 165.103,
392
+ "eval_steps_per_second": 2.685,
393
+ "step": 640
394
  },
395
  {
396
+ "epoch": 10.0,
397
+ "step": 640,
398
+ "total_flos": 4.064614037073838e+18,
399
+ "train_loss": 0.8882290616631507,
400
+ "train_runtime": 1944.8842,
401
+ "train_samples_per_second": 84.062,
402
+ "train_steps_per_second": 0.329
403
  }
404
  ],
405
  "logging_steps": 15,
406
+ "max_steps": 640,
407
  "num_input_tokens_seen": 0,
408
+ "num_train_epochs": 10,
409
  "save_steps": 500,
410
  "stateful_callbacks": {
411
  "TrainerControl": {
 
419
  "attributes": {}
420
  }
421
  },
422
+ "total_flos": 4.064614037073838e+18,
423
  "train_batch_size": 64,
424
  "trial_name": null,
425
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbb0109880a2498c03f2d417f14abb40ba113e6fea09d08d99c53204fcd580aa
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e757de8f4fdb59c52b6ad3219481d0b6bb84b344ae9ca28d91a978cac817f0ea
3
  size 5304