kanishka committed on
Commit
833954f
1 Parent(s): a3b98d2

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +16 -0
  3. eval_results.json +10 -0
  4. train_results.json +9 -0
  5. trainer_state.json +497 -0
README.md CHANGED
@@ -2,11 +2,23 @@
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
 
 
5
  metrics:
6
  - accuracy
7
  model-index:
8
  - name: opt-babylm2-clean-spacy-32k-earlystop_seed-42_1e-3
9
- results: []
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # opt-babylm2-clean-spacy-32k-earlystop_seed-42_1e-3
16
 
17
- This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
  - Loss: 2.9103
20
  - Accuracy: 0.4305
 
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - kanishka/babylm2-clean-spacy
7
  metrics:
8
  - accuracy
9
  model-index:
10
  - name: opt-babylm2-clean-spacy-32k-earlystop_seed-42_1e-3
11
+ results:
12
+ - task:
13
+ name: Causal Language Modeling
14
+ type: text-generation
15
+ dataset:
16
+ name: kanishka/babylm2-clean-spacy
17
+ type: kanishka/babylm2-clean-spacy
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.43054403912594785
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # opt-babylm2-clean-spacy-32k-earlystop_seed-42_1e-3
28
 
29
+ This model was trained from scratch on the kanishka/babylm2-clean-spacy dataset.
30
  It achieves the following results on the evaluation set:
31
  - Loss: 2.9103
32
  - Accuracy: 0.4305
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.990992729846234,
3
+ "eval_accuracy": 0.43054403912594785,
4
+ "eval_loss": 2.910346746444702,
5
+ "eval_runtime": 112.5282,
6
+ "eval_samples": 52440,
7
+ "eval_samples_per_second": 466.016,
8
+ "eval_steps_per_second": 7.287,
9
+ "perplexity": 18.36316482533208,
10
+ "total_flos": 1.298988775636992e+18,
11
+ "train_loss": 3.0016495520987547,
12
+ "train_runtime": 41094.0958,
13
+ "train_samples": 497364,
14
+ "train_samples_per_second": 242.061,
15
+ "train_steps_per_second": 0.945
16
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.990992729846234,
3
+ "eval_accuracy": 0.43054403912594785,
4
+ "eval_loss": 2.910346746444702,
5
+ "eval_runtime": 112.5282,
6
+ "eval_samples": 52440,
7
+ "eval_samples_per_second": 466.016,
8
+ "eval_steps_per_second": 7.287,
9
+ "perplexity": 18.36316482533208
10
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 19.990992729846234,
3
+ "total_flos": 1.298988775636992e+18,
4
+ "train_loss": 3.0016495520987547,
5
+ "train_runtime": 41094.0958,
6
+ "train_samples": 497364,
7
+ "train_samples_per_second": 242.061,
8
+ "train_steps_per_second": 0.945
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.910346746444702,
3
+ "best_model_checkpoint": "models/opt-babylm2-clean-spacy-32k-earlystop_seed-42_1e-3/checkpoint-38840",
4
+ "epoch": 19.990992729846234,
5
+ "eval_steps": 500,
6
+ "global_step": 38840,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.5147011516438268,
13
+ "grad_norm": 0.4890323877334595,
14
+ "learning_rate": 3.125e-05,
15
+ "loss": 5.9107,
16
+ "step": 1000
17
+ },
18
+ {
19
+ "epoch": 0.9995496364923117,
20
+ "eval_accuracy": 0.32687433630965734,
21
+ "eval_loss": 3.988743305206299,
22
+ "eval_runtime": 113.4579,
23
+ "eval_samples_per_second": 462.198,
24
+ "eval_steps_per_second": 7.227,
25
+ "step": 1942
26
+ },
27
+ {
28
+ "epoch": 1.0294023032876536,
29
+ "grad_norm": 0.6617768406867981,
30
+ "learning_rate": 6.25e-05,
31
+ "loss": 4.1795,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 1.5441034549314803,
36
+ "grad_norm": 0.5740628838539124,
37
+ "learning_rate": 9.375e-05,
38
+ "loss": 3.7896,
39
+ "step": 3000
40
+ },
41
+ {
42
+ "epoch": 1.9996139741362673,
43
+ "eval_accuracy": 0.36569809006745335,
44
+ "eval_loss": 3.523646593093872,
45
+ "eval_runtime": 113.8096,
46
+ "eval_samples_per_second": 460.769,
47
+ "eval_steps_per_second": 7.205,
48
+ "step": 3885
49
+ },
50
+ {
51
+ "epoch": 2.058804606575307,
52
+ "grad_norm": 0.5854473114013672,
53
+ "learning_rate": 0.000125,
54
+ "loss": 3.5494,
55
+ "step": 4000
56
+ },
57
+ {
58
+ "epoch": 2.573505758219134,
59
+ "grad_norm": 0.5083662271499634,
60
+ "learning_rate": 0.00015625,
61
+ "loss": 3.3813,
62
+ "step": 5000
63
+ },
64
+ {
65
+ "epoch": 2.9996783117802224,
66
+ "eval_accuracy": 0.38593522382255724,
67
+ "eval_loss": 3.304034948348999,
68
+ "eval_runtime": 113.7364,
69
+ "eval_samples_per_second": 461.066,
70
+ "eval_steps_per_second": 7.21,
71
+ "step": 5828
72
+ },
73
+ {
74
+ "epoch": 3.088206909862961,
75
+ "grad_norm": 0.5111091732978821,
76
+ "learning_rate": 0.0001875,
77
+ "loss": 3.2658,
78
+ "step": 6000
79
+ },
80
+ {
81
+ "epoch": 3.6029080615067874,
82
+ "grad_norm": 0.46191349625587463,
83
+ "learning_rate": 0.00021875,
84
+ "loss": 3.174,
85
+ "step": 7000
86
+ },
87
+ {
88
+ "epoch": 3.999742649424178,
89
+ "eval_accuracy": 0.3961827522771122,
90
+ "eval_loss": 3.1921420097351074,
91
+ "eval_runtime": 113.4299,
92
+ "eval_samples_per_second": 462.312,
93
+ "eval_steps_per_second": 7.229,
94
+ "step": 7771
95
+ },
96
+ {
97
+ "epoch": 4.117609213150614,
98
+ "grad_norm": 0.40757909417152405,
99
+ "learning_rate": 0.00025,
100
+ "loss": 3.1123,
101
+ "step": 8000
102
+ },
103
+ {
104
+ "epoch": 4.632310364794441,
105
+ "grad_norm": 0.38333743810653687,
106
+ "learning_rate": 0.00028125000000000003,
107
+ "loss": 3.0533,
108
+ "step": 9000
109
+ },
110
+ {
111
+ "epoch": 4.999806987068133,
112
+ "eval_accuracy": 0.4026235772722514,
113
+ "eval_loss": 3.126554250717163,
114
+ "eval_runtime": 113.5855,
115
+ "eval_samples_per_second": 461.679,
116
+ "eval_steps_per_second": 7.219,
117
+ "step": 9714
118
+ },
119
+ {
120
+ "epoch": 5.147011516438268,
121
+ "grad_norm": 0.35808226466178894,
122
+ "learning_rate": 0.0003125,
123
+ "loss": 3.0181,
124
+ "step": 10000
125
+ },
126
+ {
127
+ "epoch": 5.661712668082095,
128
+ "grad_norm": 0.33792659640312195,
129
+ "learning_rate": 0.00034375,
130
+ "loss": 2.9768,
131
+ "step": 11000
132
+ },
133
+ {
134
+ "epoch": 5.999871324712089,
135
+ "eval_accuracy": 0.4071174526255964,
136
+ "eval_loss": 3.0837907791137695,
137
+ "eval_runtime": 113.7293,
138
+ "eval_samples_per_second": 461.095,
139
+ "eval_steps_per_second": 7.21,
140
+ "step": 11657
141
+ },
142
+ {
143
+ "epoch": 6.176413819725922,
144
+ "grad_norm": 0.3134087324142456,
145
+ "learning_rate": 0.000375,
146
+ "loss": 2.9528,
147
+ "step": 12000
148
+ },
149
+ {
150
+ "epoch": 6.691114971369748,
151
+ "grad_norm": 0.2982269525527954,
152
+ "learning_rate": 0.00040625000000000004,
153
+ "loss": 2.9232,
154
+ "step": 13000
155
+ },
156
+ {
157
+ "epoch": 6.9999356623560445,
158
+ "eval_accuracy": 0.41005010394699454,
159
+ "eval_loss": 3.054988145828247,
160
+ "eval_runtime": 113.7694,
161
+ "eval_samples_per_second": 460.932,
162
+ "eval_steps_per_second": 7.208,
163
+ "step": 13600
164
+ },
165
+ {
166
+ "epoch": 7.205816123013575,
167
+ "grad_norm": 0.3095534145832062,
168
+ "learning_rate": 0.0004375,
169
+ "loss": 2.9047,
170
+ "step": 14000
171
+ },
172
+ {
173
+ "epoch": 7.720517274657402,
174
+ "grad_norm": 0.28180333971977234,
175
+ "learning_rate": 0.0004686875,
176
+ "loss": 2.8863,
177
+ "step": 15000
178
+ },
179
+ {
180
+ "epoch": 8.0,
181
+ "eval_accuracy": 0.41221526749525134,
182
+ "eval_loss": 3.0362703800201416,
183
+ "eval_runtime": 113.4545,
184
+ "eval_samples_per_second": 462.212,
185
+ "eval_steps_per_second": 7.228,
186
+ "step": 15543
187
+ },
188
+ {
189
+ "epoch": 8.235218426301229,
190
+ "grad_norm": 0.27651330828666687,
191
+ "learning_rate": 0.0004999375,
192
+ "loss": 2.8645,
193
+ "step": 16000
194
+ },
195
+ {
196
+ "epoch": 8.749919577945056,
197
+ "grad_norm": 0.2618395984172821,
198
+ "learning_rate": 0.0005311875000000001,
199
+ "loss": 2.8563,
200
+ "step": 17000
201
+ },
202
+ {
203
+ "epoch": 8.999549636492311,
204
+ "eval_accuracy": 0.41385650827836856,
205
+ "eval_loss": 3.0208170413970947,
206
+ "eval_runtime": 113.5361,
207
+ "eval_samples_per_second": 461.879,
208
+ "eval_steps_per_second": 7.222,
209
+ "step": 17485
210
+ },
211
+ {
212
+ "epoch": 9.264620729588882,
213
+ "grad_norm": 0.2651124894618988,
214
+ "learning_rate": 0.00056240625,
215
+ "loss": 2.8341,
216
+ "step": 18000
217
+ },
218
+ {
219
+ "epoch": 9.77932188123271,
220
+ "grad_norm": 0.25122708082199097,
221
+ "learning_rate": 0.00059365625,
222
+ "loss": 2.8356,
223
+ "step": 19000
224
+ },
225
+ {
226
+ "epoch": 9.999613974136267,
227
+ "eval_accuracy": 0.41513595369497913,
228
+ "eval_loss": 3.011690139770508,
229
+ "eval_runtime": 113.8076,
230
+ "eval_samples_per_second": 460.778,
231
+ "eval_steps_per_second": 7.205,
232
+ "step": 19428
233
+ },
234
+ {
235
+ "epoch": 10.294023032876536,
236
+ "grad_norm": 0.24296793341636658,
237
+ "learning_rate": 0.00062490625,
238
+ "loss": 2.8116,
239
+ "step": 20000
240
+ },
241
+ {
242
+ "epoch": 10.808724184520363,
243
+ "grad_norm": 0.2230454832315445,
244
+ "learning_rate": 0.0006561562500000001,
245
+ "loss": 2.816,
246
+ "step": 21000
247
+ },
248
+ {
249
+ "epoch": 10.999678311780222,
250
+ "eval_accuracy": 0.4161677958750243,
251
+ "eval_loss": 3.0030288696289062,
252
+ "eval_runtime": 113.595,
253
+ "eval_samples_per_second": 461.64,
254
+ "eval_steps_per_second": 7.219,
255
+ "step": 21371
256
+ },
257
+ {
258
+ "epoch": 11.32342533616419,
259
+ "grad_norm": 0.2214185744524002,
260
+ "learning_rate": 0.00068734375,
261
+ "loss": 2.7929,
262
+ "step": 22000
263
+ },
264
+ {
265
+ "epoch": 11.838126487808017,
266
+ "grad_norm": 0.20495054125785828,
267
+ "learning_rate": 0.00071859375,
268
+ "loss": 2.8069,
269
+ "step": 23000
270
+ },
271
+ {
272
+ "epoch": 11.999742649424178,
273
+ "eval_accuracy": 0.41698351804489914,
274
+ "eval_loss": 2.9951212406158447,
275
+ "eval_runtime": 113.473,
276
+ "eval_samples_per_second": 462.136,
277
+ "eval_steps_per_second": 7.226,
278
+ "step": 23314
279
+ },
280
+ {
281
+ "epoch": 12.352827639451844,
282
+ "grad_norm": 0.21121680736541748,
283
+ "learning_rate": 0.0007498437500000001,
284
+ "loss": 2.7777,
285
+ "step": 24000
286
+ },
287
+ {
288
+ "epoch": 12.86752879109567,
289
+ "grad_norm": 0.20208890736103058,
290
+ "learning_rate": 0.00078109375,
291
+ "loss": 2.7941,
292
+ "step": 25000
293
+ },
294
+ {
295
+ "epoch": 12.999806987068133,
296
+ "eval_accuracy": 0.41746990024079805,
297
+ "eval_loss": 2.9923195838928223,
298
+ "eval_runtime": 113.7475,
299
+ "eval_samples_per_second": 461.021,
300
+ "eval_steps_per_second": 7.209,
301
+ "step": 25257
302
+ },
303
+ {
304
+ "epoch": 13.382229942739498,
305
+ "grad_norm": 0.21585527062416077,
306
+ "learning_rate": 0.0008123125,
307
+ "loss": 2.7644,
308
+ "step": 26000
309
+ },
310
+ {
311
+ "epoch": 13.896931094383323,
312
+ "grad_norm": 0.19069883227348328,
313
+ "learning_rate": 0.00084353125,
314
+ "loss": 2.7889,
315
+ "step": 27000
316
+ },
317
+ {
318
+ "epoch": 13.999871324712089,
319
+ "eval_accuracy": 0.4181739728690866,
320
+ "eval_loss": 2.9887681007385254,
321
+ "eval_runtime": 113.6405,
322
+ "eval_samples_per_second": 461.455,
323
+ "eval_steps_per_second": 7.216,
324
+ "step": 27200
325
+ },
326
+ {
327
+ "epoch": 14.41163224602715,
328
+ "grad_norm": 0.19703735411167145,
329
+ "learning_rate": 0.00087478125,
330
+ "loss": 2.757,
331
+ "step": 28000
332
+ },
333
+ {
334
+ "epoch": 14.926333397670977,
335
+ "grad_norm": 0.18608474731445312,
336
+ "learning_rate": 0.0009060312499999999,
337
+ "loss": 2.7802,
338
+ "step": 29000
339
+ },
340
+ {
341
+ "epoch": 14.999935662356044,
342
+ "eval_accuracy": 0.41862535708409987,
343
+ "eval_loss": 2.9839344024658203,
344
+ "eval_runtime": 113.3429,
345
+ "eval_samples_per_second": 462.667,
346
+ "eval_steps_per_second": 7.235,
347
+ "step": 29143
348
+ },
349
+ {
350
+ "epoch": 15.441034549314804,
351
+ "grad_norm": 0.18645653128623962,
352
+ "learning_rate": 0.00093725,
353
+ "loss": 2.7483,
354
+ "step": 30000
355
+ },
356
+ {
357
+ "epoch": 15.95573570095863,
358
+ "grad_norm": 0.18724338710308075,
359
+ "learning_rate": 0.0009685000000000001,
360
+ "loss": 2.7802,
361
+ "step": 31000
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "eval_accuracy": 0.4189873767966378,
366
+ "eval_loss": 2.9820570945739746,
367
+ "eval_runtime": 113.2324,
368
+ "eval_samples_per_second": 463.118,
369
+ "eval_steps_per_second": 7.242,
370
+ "step": 31086
371
+ },
372
+ {
373
+ "epoch": 16.470436852602457,
374
+ "grad_norm": 0.18728293478488922,
375
+ "learning_rate": 0.00099971875,
376
+ "loss": 2.7412,
377
+ "step": 32000
378
+ },
379
+ {
380
+ "epoch": 16.985138004246284,
381
+ "grad_norm": 0.18331313133239746,
382
+ "learning_rate": 0.0008551169590643275,
383
+ "loss": 2.7665,
384
+ "step": 33000
385
+ },
386
+ {
387
+ "epoch": 16.99954963649231,
388
+ "eval_accuracy": 0.4212147589775804,
389
+ "eval_loss": 2.962628126144409,
390
+ "eval_runtime": 112.7133,
391
+ "eval_samples_per_second": 465.251,
392
+ "eval_steps_per_second": 7.275,
393
+ "step": 33028
394
+ },
395
+ {
396
+ "epoch": 17.49983915589011,
397
+ "grad_norm": 0.17958644032478333,
398
+ "learning_rate": 0.0007089181286549707,
399
+ "loss": 2.6908,
400
+ "step": 34000
401
+ },
402
+ {
403
+ "epoch": 17.999613974136267,
404
+ "eval_accuracy": 0.42467125828210767,
405
+ "eval_loss": 2.937788963317871,
406
+ "eval_runtime": 112.8442,
407
+ "eval_samples_per_second": 464.711,
408
+ "eval_steps_per_second": 7.267,
409
+ "step": 34971
410
+ },
411
+ {
412
+ "epoch": 18.014540307533938,
413
+ "grad_norm": 0.19127266108989716,
414
+ "learning_rate": 0.0005627192982456141,
415
+ "loss": 2.6939,
416
+ "step": 35000
417
+ },
418
+ {
419
+ "epoch": 18.529241459177765,
420
+ "grad_norm": 0.17924794554710388,
421
+ "learning_rate": 0.0004166666666666667,
422
+ "loss": 2.6058,
423
+ "step": 36000
424
+ },
425
+ {
426
+ "epoch": 18.999678311780222,
427
+ "eval_accuracy": 0.4284452072209509,
428
+ "eval_loss": 2.914487600326538,
429
+ "eval_runtime": 113.021,
430
+ "eval_samples_per_second": 463.985,
431
+ "eval_steps_per_second": 7.255,
432
+ "step": 36914
433
+ },
434
+ {
435
+ "epoch": 19.043942610821592,
436
+ "grad_norm": 0.19451527297496796,
437
+ "learning_rate": 0.00027046783625730997,
438
+ "loss": 2.5978,
439
+ "step": 37000
440
+ },
441
+ {
442
+ "epoch": 19.55864376246542,
443
+ "grad_norm": 0.19833345711231232,
444
+ "learning_rate": 0.0001242690058479532,
445
+ "loss": 2.505,
446
+ "step": 38000
447
+ },
448
+ {
449
+ "epoch": 19.990992729846234,
450
+ "eval_accuracy": 0.43054403912594785,
451
+ "eval_loss": 2.910346746444702,
452
+ "eval_runtime": 112.944,
453
+ "eval_samples_per_second": 464.301,
454
+ "eval_steps_per_second": 7.26,
455
+ "step": 38840
456
+ },
457
+ {
458
+ "epoch": 19.990992729846234,
459
+ "step": 38840,
460
+ "total_flos": 1.298988775636992e+18,
461
+ "train_loss": 3.0016495520987547,
462
+ "train_runtime": 41094.0958,
463
+ "train_samples_per_second": 242.061,
464
+ "train_steps_per_second": 0.945
465
+ }
466
+ ],
467
+ "logging_steps": 1000,
468
+ "max_steps": 38840,
469
+ "num_input_tokens_seen": 0,
470
+ "num_train_epochs": 20,
471
+ "save_steps": 500,
472
+ "stateful_callbacks": {
473
+ "EarlyStoppingCallback": {
474
+ "args": {
475
+ "early_stopping_patience": 3,
476
+ "early_stopping_threshold": 0.0
477
+ },
478
+ "attributes": {
479
+ "early_stopping_patience_counter": 0
480
+ }
481
+ },
482
+ "TrainerControl": {
483
+ "args": {
484
+ "should_epoch_stop": false,
485
+ "should_evaluate": false,
486
+ "should_log": false,
487
+ "should_save": true,
488
+ "should_training_stop": true
489
+ },
490
+ "attributes": {}
491
+ }
492
+ },
493
+ "total_flos": 1.298988775636992e+18,
494
+ "train_batch_size": 32,
495
+ "trial_name": null,
496
+ "trial_params": null
497
+ }