Gizachew committed
Commit 3d42703
1 Parent(s): 5aea56b

End of training

README.md CHANGED
@@ -13,13 +13,12 @@ model-index:
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment. -->
 
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/hallo23/huggingface/runs/8oefwxr0)
  # ckpts
 
  This model is a fine-tuned version of [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.2441
- - Accuracy: 0.9394
 
  ## Model description
 
@@ -51,18 +50,24 @@ The following hyperparameters were used during training:
 
  ### Training results
 
- | Training Loss | Epoch   | Step | Validation Loss | Accuracy |
- |:-------------:|:-------:|:----:|:---------------:|:--------:|
- | 0.4948        | 2.2422  | 500  | 0.3149          | 0.9091   |
- | 0.2091        | 4.4843  | 1000 | 0.2441          | 0.9394   |
- | 0.1252        | 6.7265  | 1500 | 0.2574          | 0.9495   |
- | 0.1459        | 8.9686  | 2000 | 0.2598          | 0.9495   |
- | 0.1062        | 11.2108 | 2500 | 0.2850          | 0.9495   |
 
 
  ### Framework versions
 
- - Transformers 4.41.0.dev0
  - Pytorch 2.1.2
- - Datasets 2.19.1.dev0
- - Tokenizers 0.19.1

  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment. -->
 
  # ckpts
 
  This model is a fine-tuned version of [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) on an unknown dataset.
  It achieves the following results on the evaluation set:
+ - Loss: 0.3189
+ - Accuracy: 0.9444
 
  ## Model description
 
 
  ### Training results
 
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
+ | 0.7722        | 1.0   | 223  | 0.4733          | 0.8434   |
+ | 0.4755        | 2.0   | 446  | 0.4240          | 0.8687   |
+ | 0.3262        | 3.0   | 669  | 0.2939          | 0.9343   |
+ | 0.2642        | 4.0   | 892  | 0.3087          | 0.9293   |
+ | 0.191         | 5.0   | 1115 | 0.3079          | 0.9394   |
+ | 0.1534        | 6.0   | 1338 | 0.3134          | 0.9394   |
+ | 0.1571        | 7.0   | 1561 | 0.4009          | 0.9293   |
+ | 0.1328        | 8.0   | 1784 | 0.3189          | 0.9444   |
+ | 0.1567        | 9.0   | 2007 | 0.4089          | 0.9192   |
+ | 0.1043        | 10.0  | 2230 | 0.3429          | 0.9343   |
+ | 0.1161        | 11.0  | 2453 | 0.3534          | 0.9394   |
 
 
  ### Framework versions
 
+ - Transformers 4.39.3
  - Pytorch 2.1.2
+ - Datasets 2.18.0
+ - Tokenizers 0.15.2
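
The model card above ships no usage code, so the following is only a minimal, hypothetical inference sketch for a checkpoint like the one this commit updates: an audio-classification head on `facebook/hubert-base-ls960` with `problem_type: single_label_classification` (see `config.json` below). The checkpoint identifier, the example WAV file, and the label mapping are placeholders, not part of this repository.

```python
# Hypothetical usage sketch; "path/or/repo-id-of-ckpts" is a placeholder for the
# actual Hub repo id or a local directory such as /kaggle/working/ckpts.
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

ckpt = "path/or/repo-id-of-ckpts"
feature_extractor = AutoFeatureExtractor.from_pretrained(ckpt)
model = AutoModelForAudioClassification.from_pretrained(ckpt)
model.eval()

# Load a waveform, downmix to mono, and resample to the 16 kHz rate HuBERT expects.
waveform, sr = torchaudio.load("example.wav")
waveform = torchaudio.functional.resample(waveform, sr, 16000).mean(dim=0)

inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_id = logits.argmax(dim=-1).item()
print(model.config.id2label.get(predicted_id, predicted_id))
```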
all_results.json CHANGED
@@ -1,14 +1,14 @@
  {
- "epoch": 11.210762331838565,
- "eval_accuracy": 0.939393937587738,
- "eval_loss": 0.2440650463104248,
- "eval_runtime": 9.1551,
- "eval_samples_per_second": 21.627,
- "eval_steps_per_second": 5.461,
- "total_flos": 6.392309759902944e+17,
- "train_loss": 0.3350444522857666,
- "train_runtime": 1353.7938,
  "train_samples": 1781,
- "train_samples_per_second": 19.733,
- "train_steps_per_second": 2.471
  }

  {
+ "epoch": 11.0,
+ "eval_accuracy": 0.9444444179534912,
+ "eval_loss": 0.3189202845096588,
+ "eval_runtime": 9.0656,
+ "eval_samples_per_second": 21.841,
+ "eval_steps_per_second": 5.515,
+ "total_flos": 6.273609670944864e+17,
+ "train_loss": 0.2735439113728912,
+ "train_runtime": 1317.5896,
  "train_samples": 1781,
+ "train_samples_per_second": 20.276,
+ "train_steps_per_second": 2.539
  }
config.json CHANGED
@@ -87,7 +87,7 @@
  "problem_type": "single_label_classification",
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
  "torch_dtype": "float32",
- "transformers_version": "4.41.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 32
  }

  "problem_type": "single_label_classification",
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
  "torch_dtype": "float32",
+ "transformers_version": "4.39.3",
  "use_weighted_layer_sum": false,
  "vocab_size": 32
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 11.210762331838565,
- "eval_accuracy": 0.939393937587738,
- "eval_loss": 0.2440650463104248,
- "eval_runtime": 9.1551,
- "eval_samples_per_second": 21.627,
- "eval_steps_per_second": 5.461
  }

  {
+ "epoch": 11.0,
+ "eval_accuracy": 0.9444444179534912,
+ "eval_loss": 0.3189202845096588,
+ "eval_runtime": 9.0656,
+ "eval_samples_per_second": 21.841,
+ "eval_steps_per_second": 5.515
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8a20a752159837c75f2cc88929592ca3273e30623d416393032de66165047cf8
  size 379890236

  version https://git-lfs.github.com/spec/v1
+ oid sha256:d9ca5e862fb246d7eaa7bfe0560a4177c3c1b4da82ae4c7765071e7b5f8402b5
  size 379890236
runs/Apr23_01-28-16_6ef5debac42d/events.out.tfevents.1713835697.6ef5debac42d.34.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a9f9a84df5d0b7a943fd4c6f03184e98145fcb3e1adce3028fa5b9099aed8e9
+ size 6016
runs/Apr23_02-26-48_6ef5debac42d/events.out.tfevents.1713839214.6ef5debac42d.34.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eed67d284e1a49e7d9efe29567c6844f19210307f0b2b762e32a701f57d060dd
+ size 14829
runs/Apr23_02-26-48_6ef5debac42d/events.out.tfevents.1713840541.6ef5debac42d.34.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd6dc00b487f29345c8a919953fe1d96b8ebfa234649cae5634dd52b0393bd35
+ size 734
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 11.210762331838565,
- "total_flos": 6.392309759902944e+17,
- "train_loss": 0.3350444522857666,
- "train_runtime": 1353.7938,
  "train_samples": 1781,
- "train_samples_per_second": 19.733,
- "train_steps_per_second": 2.471
  }

  {
+ "epoch": 11.0,
+ "total_flos": 6.273609670944864e+17,
+ "train_loss": 0.2735439113728912,
+ "train_runtime": 1317.5896,
  "train_samples": 1781,
+ "train_samples_per_second": 20.276,
+ "train_steps_per_second": 2.539
  }
trainer_state.json CHANGED
@@ -1,250 +1,297 @@
  {
- "best_metric": 0.2440650463104248,
- "best_model_checkpoint": "/kaggle/working/ckpts/checkpoint-1000",
- "epoch": 11.210762331838565,
  "eval_steps": 500,
- "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.4484304932735426,
- "grad_norm": 2.7230775356292725,
  "learning_rate": 9.701046337817639e-06,
- "loss": 1.5364,
  "step": 100
  },
  {
- "epoch": 0.8968609865470852,
- "grad_norm": 3.450390577316284,
- "learning_rate": 9.402092675635277e-06,
- "loss": 1.1843,
  "step": 200
  },
  {
- "epoch": 1.3452914798206277,
- "grad_norm": 6.235444068908691,
  "learning_rate": 9.106128550074738e-06,
- "loss": 0.8509,
  "step": 300
  },
  {
- "epoch": 1.7937219730941703,
- "grad_norm": 3.819420099258423,
  "learning_rate": 8.807174887892378e-06,
- "loss": 0.619,
  "step": 400
  },
  {
- "epoch": 2.242152466367713,
- "grad_norm": 3.2721917629241943,
- "learning_rate": 8.51121076233184e-06,
- "loss": 0.4948,
- "step": 500
  },
  {
- "epoch": 2.242152466367713,
- "eval_accuracy": 0.9090909361839294,
- "eval_loss": 0.31492409110069275,
- "eval_runtime": 9.2418,
- "eval_samples_per_second": 21.424,
- "eval_steps_per_second": 5.41,
  "step": 500
  },
  {
- "epoch": 2.6905829596412554,
- "grad_norm": 2.11232328414917,
- "learning_rate": 8.212257100149478e-06,
- "loss": 0.3709,
  "step": 600
  },
  {
- "epoch": 3.1390134529147984,
- "grad_norm": 8.740654945373535,
- "learning_rate": 7.913303437967116e-06,
- "loss": 0.3427,
  "step": 700
  },
  {
- "epoch": 3.587443946188341,
- "grad_norm": 18.510692596435547,
  "learning_rate": 7.617339312406578e-06,
- "loss": 0.2957,
  "step": 800
  },
  {
- "epoch": 4.0358744394618835,
- "grad_norm": 54.879703521728516,
  "learning_rate": 7.318385650224216e-06,
- "loss": 0.3187,
  "step": 900
  },
  {
- "epoch": 4.484304932735426,
- "grad_norm": 25.553455352783203,
  "learning_rate": 7.019431988041854e-06,
- "loss": 0.2091,
  "step": 1000
  },
  {
- "epoch": 4.484304932735426,
- "eval_accuracy": 0.939393937587738,
- "eval_loss": 0.2440650463104248,
- "eval_runtime": 9.3073,
- "eval_samples_per_second": 21.274,
- "eval_steps_per_second": 5.372,
- "step": 1000
  },
  {
- "epoch": 4.932735426008969,
- "grad_norm": 18.517961502075195,
- "learning_rate": 6.720478325859492e-06,
- "loss": 0.1786,
- "step": 1100
  },
  {
- "epoch": 5.381165919282511,
- "grad_norm": 18.12173080444336,
- "learning_rate": 6.421524663677131e-06,
- "loss": 0.2242,
  "step": 1200
  },
  {
- "epoch": 5.829596412556054,
- "grad_norm": 12.17724895477295,
- "learning_rate": 6.1225710014947695e-06,
- "loss": 0.1743,
  "step": 1300
  },
  {
- "epoch": 6.278026905829597,
- "grad_norm": 0.7298703789710999,
- "learning_rate": 5.823617339312408e-06,
- "loss": 0.1864,
  "step": 1400
  },
  {
- "epoch": 6.726457399103139,
- "grad_norm": 0.3917248845100403,
- "learning_rate": 5.524663677130046e-06,
- "loss": 0.1252,
  "step": 1500
  },
  {
- "epoch": 6.726457399103139,
- "eval_accuracy": 0.9494949579238892,
- "eval_loss": 0.2574491798877716,
- "eval_runtime": 9.3365,
- "eval_samples_per_second": 21.207,
- "eval_steps_per_second": 5.355,
- "step": 1500
  },
  {
- "epoch": 7.174887892376682,
- "grad_norm": 0.04727130010724068,
- "learning_rate": 5.225710014947683e-06,
- "loss": 0.1388,
  "step": 1600
  },
  {
- "epoch": 7.623318385650224,
- "grad_norm": 0.06415404379367828,
  "learning_rate": 4.929745889387145e-06,
- "loss": 0.1366,
  "step": 1700
  },
  {
- "epoch": 8.071748878923767,
- "grad_norm": 14.02224349975586,
  "learning_rate": 4.630792227204783e-06,
- "loss": 0.156,
  "step": 1800
  },
  {
- "epoch": 8.52017937219731,
- "grad_norm": 0.037808869034051895,
  "learning_rate": 4.3318385650224224e-06,
- "loss": 0.1297,
  "step": 1900
  },
  {
- "epoch": 8.968609865470851,
- "grad_norm": 41.554500579833984,
  "learning_rate": 4.03288490284006e-06,
- "loss": 0.1459,
  "step": 2000
  },
  {
- "epoch": 8.968609865470851,
- "eval_accuracy": 0.9494949579238892,
- "eval_loss": 0.2597883343696594,
- "eval_runtime": 9.3051,
- "eval_samples_per_second": 21.279,
- "eval_steps_per_second": 5.373,
- "step": 2000
  },
  {
- "epoch": 9.417040358744394,
- "grad_norm": 0.05968519300222397,
  "learning_rate": 3.7339312406576984e-06,
- "loss": 0.1264,
  "step": 2100
  },
  {
- "epoch": 9.865470852017937,
- "grad_norm": 0.05496314913034439,
  "learning_rate": 3.4349775784753366e-06,
- "loss": 0.0953,
  "step": 2200
  },
  {
- "epoch": 10.31390134529148,
- "grad_norm": 0.08967319875955582,
  "learning_rate": 3.136023916292975e-06,
- "loss": 0.1124,
  "step": 2300
  },
  {
- "epoch": 10.762331838565022,
- "grad_norm": 0.25198718905448914,
  "learning_rate": 2.8370702541106134e-06,
- "loss": 0.1178,
  "step": 2400
  },
  {
- "epoch": 11.210762331838565,
- "grad_norm": 26.92681884765625,
- "learning_rate": 2.538116591928251e-06,
- "loss": 0.1062,
- "step": 2500
- },
- {
- "epoch": 11.210762331838565,
- "eval_accuracy": 0.9494949579238892,
- "eval_loss": 0.28504666686058044,
- "eval_runtime": 9.4727,
- "eval_samples_per_second": 20.902,
- "eval_steps_per_second": 5.278,
- "step": 2500
  },
  {
- "epoch": 11.210762331838565,
- "step": 2500,
- "total_flos": 6.392309759902944e+17,
- "train_loss": 0.3350444522857666,
- "train_runtime": 1353.7938,
- "train_samples_per_second": 19.733,
- "train_steps_per_second": 2.471
  },
  {
- "epoch": 11.210762331838565,
- "eval_accuracy": 0.939393937587738,
- "eval_loss": 0.2440650463104248,
- "eval_runtime": 9.1551,
- "eval_samples_per_second": 21.627,
- "eval_steps_per_second": 5.461,
- "step": 2500
  }
  ],
  "logging_steps": 100,
@@ -252,7 +299,7 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
- "total_flos": 6.392309759902944e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
 
  {
+ "best_metric": 0.9444444179534912,
+ "best_model_checkpoint": "/kaggle/working/ckpts/checkpoint-1784",
+ "epoch": 11.0,
  "eval_steps": 500,
+ "global_step": 2453,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "epoch": 0.45,
+ "grad_norm": 3.8796565532684326,
  "learning_rate": 9.701046337817639e-06,
+ "loss": 1.0979,
  "step": 100
  },
  {
+ "epoch": 0.9,
+ "grad_norm": 2.6467082500457764,
+ "learning_rate": 9.4050822122571e-06,
+ "loss": 0.7722,
  "step": 200
  },
  {
+ "epoch": 1.0,
+ "eval_accuracy": 0.8434343338012695,
+ "eval_loss": 0.4733273386955261,
+ "eval_runtime": 9.4201,
+ "eval_samples_per_second": 21.019,
+ "eval_steps_per_second": 5.308,
+ "step": 223
+ },
+ {
+ "epoch": 1.35,
+ "grad_norm": 20.371318817138672,
  "learning_rate": 9.106128550074738e-06,
+ "loss": 0.5871,
  "step": 300
  },
  {
+ "epoch": 1.79,
+ "grad_norm": 3.244633674621582,
  "learning_rate": 8.807174887892378e-06,
+ "loss": 0.4755,
  "step": 400
  },
  {
+ "epoch": 2.0,
+ "eval_accuracy": 0.868686854839325,
+ "eval_loss": 0.42396825551986694,
+ "eval_runtime": 9.3161,
+ "eval_samples_per_second": 21.254,
+ "eval_steps_per_second": 5.367,
+ "step": 446
  },
  {
+ "epoch": 2.24,
+ "grad_norm": 8.883748054504395,
+ "learning_rate": 8.51121076233184e-06,
+ "loss": 0.377,
  "step": 500
  },
  {
+ "epoch": 2.69,
+ "grad_norm": 1.6546649932861328,
+ "learning_rate": 8.2152466367713e-06,
+ "loss": 0.3262,
  "step": 600
  },
  {
+ "epoch": 3.0,
+ "eval_accuracy": 0.9343434572219849,
+ "eval_loss": 0.2939111590385437,
+ "eval_runtime": 9.4493,
+ "eval_samples_per_second": 20.954,
+ "eval_steps_per_second": 5.291,
+ "step": 669
+ },
+ {
+ "epoch": 3.14,
+ "grad_norm": 7.946984767913818,
+ "learning_rate": 7.916292974588939e-06,
+ "loss": 0.2347,
  "step": 700
  },
  {
+ "epoch": 3.59,
+ "grad_norm": 9.960402488708496,
  "learning_rate": 7.617339312406578e-06,
+ "loss": 0.2642,
  "step": 800
  },
  {
+ "epoch": 4.0,
+ "eval_accuracy": 0.9292929172515869,
+ "eval_loss": 0.3087417781352997,
+ "eval_runtime": 9.224,
+ "eval_samples_per_second": 21.466,
+ "eval_steps_per_second": 5.421,
+ "step": 892
+ },
+ {
+ "epoch": 4.04,
+ "grad_norm": 87.66458129882812,
  "learning_rate": 7.318385650224216e-06,
+ "loss": 0.2608,
  "step": 900
  },
  {
+ "epoch": 4.48,
+ "grad_norm": 42.06097412109375,
  "learning_rate": 7.019431988041854e-06,
+ "loss": 0.213,
  "step": 1000
  },
  {
+ "epoch": 4.93,
+ "grad_norm": 21.227588653564453,
+ "learning_rate": 6.723467862481315e-06,
+ "loss": 0.191,
+ "step": 1100
  },
  {
+ "epoch": 5.0,
+ "eval_accuracy": 0.939393937587738,
+ "eval_loss": 0.30786794424057007,
+ "eval_runtime": 9.2259,
+ "eval_samples_per_second": 21.461,
+ "eval_steps_per_second": 5.42,
+ "step": 1115
  },
  {
+ "epoch": 5.38,
+ "grad_norm": 0.09492979198694229,
+ "learning_rate": 6.424514200298954e-06,
+ "loss": 0.1891,
  "step": 1200
  },
  {
+ "epoch": 5.83,
+ "grad_norm": 22.492895126342773,
+ "learning_rate": 6.1255605381165925e-06,
+ "loss": 0.1534,
  "step": 1300
  },
  {
+ "epoch": 6.0,
+ "eval_accuracy": 0.939393937587738,
+ "eval_loss": 0.3133719265460968,
+ "eval_runtime": 9.3193,
+ "eval_samples_per_second": 21.246,
+ "eval_steps_per_second": 5.365,
+ "step": 1338
+ },
+ {
+ "epoch": 6.28,
+ "grad_norm": 0.05382364243268967,
+ "learning_rate": 5.826606875934231e-06,
+ "loss": 0.1825,
  "step": 1400
  },
  {
+ "epoch": 6.73,
+ "grad_norm": 5.18447732925415,
+ "learning_rate": 5.527653213751869e-06,
+ "loss": 0.1571,
  "step": 1500
  },
  {
+ "epoch": 7.0,
+ "eval_accuracy": 0.9292929172515869,
+ "eval_loss": 0.40089717507362366,
+ "eval_runtime": 9.2909,
+ "eval_samples_per_second": 21.311,
+ "eval_steps_per_second": 5.382,
+ "step": 1561
  },
  {
+ "epoch": 7.17,
+ "grad_norm": 0.036003902554512024,
+ "learning_rate": 5.228699551569507e-06,
+ "loss": 0.1518,
  "step": 1600
  },
  {
+ "epoch": 7.62,
+ "grad_norm": 0.10409737378358841,
  "learning_rate": 4.929745889387145e-06,
+ "loss": 0.1328,
  "step": 1700
  },
  {
+ "epoch": 8.0,
+ "eval_accuracy": 0.9444444179534912,
+ "eval_loss": 0.3189202845096588,
+ "eval_runtime": 9.3287,
+ "eval_samples_per_second": 21.225,
+ "eval_steps_per_second": 5.36,
+ "step": 1784
+ },
+ {
+ "epoch": 8.07,
+ "grad_norm": 6.580456733703613,
  "learning_rate": 4.630792227204783e-06,
+ "loss": 0.1127,
  "step": 1800
  },
  {
+ "epoch": 8.52,
+ "grad_norm": 0.12464825063943863,
  "learning_rate": 4.3318385650224224e-06,
+ "loss": 0.1333,
  "step": 1900
  },
  {
+ "epoch": 8.97,
+ "grad_norm": 127.78559112548828,
  "learning_rate": 4.03288490284006e-06,
+ "loss": 0.1567,
  "step": 2000
  },
  {
+ "epoch": 9.0,
+ "eval_accuracy": 0.9191918969154358,
+ "eval_loss": 0.40891000628471375,
+ "eval_runtime": 9.2849,
+ "eval_samples_per_second": 21.325,
+ "eval_steps_per_second": 5.385,
+ "step": 2007
  },
  {
+ "epoch": 9.42,
+ "grad_norm": 0.03460687771439552,
  "learning_rate": 3.7339312406576984e-06,
+ "loss": 0.1313,
  "step": 2100
  },
  {
+ "epoch": 9.87,
+ "grad_norm": 3.1638216972351074,
  "learning_rate": 3.4349775784753366e-06,
+ "loss": 0.1043,
  "step": 2200
  },
  {
+ "epoch": 10.0,
+ "eval_accuracy": 0.9343434572219849,
+ "eval_loss": 0.34286314249038696,
+ "eval_runtime": 9.3365,
+ "eval_samples_per_second": 21.207,
+ "eval_steps_per_second": 5.355,
+ "step": 2230
+ },
+ {
+ "epoch": 10.31,
+ "grad_norm": 0.462053507566452,
  "learning_rate": 3.136023916292975e-06,
+ "loss": 0.1551,
  "step": 2300
  },
  {
+ "epoch": 10.76,
+ "grad_norm": 0.5621947050094604,
  "learning_rate": 2.8370702541106134e-06,
+ "loss": 0.1161,
  "step": 2400
  },
  {
+ "epoch": 11.0,
+ "eval_accuracy": 0.939393937587738,
+ "eval_loss": 0.3534471094608307,
+ "eval_runtime": 9.2307,
+ "eval_samples_per_second": 21.45,
+ "eval_steps_per_second": 5.417,
+ "step": 2453
  },
  {
+ "epoch": 11.0,
+ "step": 2453,
+ "total_flos": 6.273609670944864e+17,
+ "train_loss": 0.2735439113728912,
+ "train_runtime": 1317.5896,
+ "train_samples_per_second": 20.276,
+ "train_steps_per_second": 2.539
  },
  {
+ "epoch": 11.0,
+ "eval_accuracy": 0.9444444179534912,
+ "eval_loss": 0.3189202845096588,
+ "eval_runtime": 9.0656,
+ "eval_samples_per_second": 21.841,
+ "eval_steps_per_second": 5.515,
+ "step": 2453
  }
  ],
  "logging_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
+ "total_flos": 6.273609670944864e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dfd0a8fac2fa2f9e7a44ceb796236325ad0e73a925cf176a1dd48186e55392f8
- size 5048

  version https://git-lfs.github.com/spec/v1
+ oid sha256:de65037c614a22836253ad31eda6e76236b1e78bf69197af95578707a3ab6bbe
+ size 4984