DazMashaly commited on
Commit
8bf86b6
1 Parent(s): eb40f38

Training in progress, step 100

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "total_flos": 2.953718820314874e+18,
4
- "train_loss": 0.09724263496534437,
5
- "train_runtime": 2056.7864,
6
- "train_samples_per_second": 18.53,
7
- "train_steps_per_second": 1.159
8
  }
 
1
  {
2
+ "epoch": 6.0,
3
+ "total_flos": 4.98440050928135e+18,
4
+ "train_loss": 0.10442606876627426,
5
+ "train_runtime": 2908.5673,
6
+ "train_samples_per_second": 22.112,
7
+ "train_steps_per_second": 1.382
8
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "google/vit-large-patch16-224-in21k",
3
  "architectures": [
4
  "ViTForImageClassification"
5
  ],
@@ -7,7 +7,7 @@
7
  "encoder_stride": 16,
8
  "hidden_act": "gelu",
9
  "hidden_dropout_prob": 0.0,
10
- "hidden_size": 1024,
11
  "id2label": {
12
  "0": "Ascariasis",
13
  "1": "Babesia",
@@ -27,7 +27,7 @@
27
  },
28
  "image_size": 224,
29
  "initializer_range": 0.02,
30
- "intermediate_size": 4096,
31
  "label2id": {
32
  "Ascariasis": "0",
33
  "Babesia": "1",
@@ -47,9 +47,9 @@
47
  },
48
  "layer_norm_eps": 1e-12,
49
  "model_type": "vit",
50
- "num_attention_heads": 16,
51
  "num_channels": 3,
52
- "num_hidden_layers": 24,
53
  "patch_size": 16,
54
  "problem_type": "single_label_classification",
55
  "qkv_bias": true,
 
1
  {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
  "architectures": [
4
  "ViTForImageClassification"
5
  ],
 
7
  "encoder_stride": 16,
8
  "hidden_act": "gelu",
9
  "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
  "id2label": {
12
  "0": "Ascariasis",
13
  "1": "Babesia",
 
27
  },
28
  "image_size": 224,
29
  "initializer_range": 0.02,
30
+ "intermediate_size": 3072,
31
  "label2id": {
32
  "Ascariasis": "0",
33
  "Babesia": "1",
 
47
  },
48
  "layer_norm_eps": 1e-12,
49
  "model_type": "vit",
50
+ "num_attention_heads": 12,
51
  "num_channels": 3,
52
+ "num_hidden_layers": 12,
53
  "patch_size": 16,
54
  "problem_type": "single_label_classification",
55
  "qkv_bias": true,
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9251cefe9001b894642b75748b6a11954bc8f7abbc2cb9ccba50885973d26f64
3
- size 1213401965
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e58ef044f580b8df606dacb935a7d4dfd49aa772163c1c6e0d5ff1d36f2cc9c2
3
+ size 343308717
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "total_flos": 2.953718820314874e+18,
4
- "train_loss": 0.09724263496534437,
5
- "train_runtime": 2056.7864,
6
- "train_samples_per_second": 18.53,
7
- "train_steps_per_second": 1.159
8
  }
 
1
  {
2
+ "epoch": 6.0,
3
+ "total_flos": 4.98440050928135e+18,
4
+ "train_loss": 0.10442606876627426,
5
+ "train_runtime": 2908.5673,
6
+ "train_samples_per_second": 22.112,
7
+ "train_steps_per_second": 1.382
8
  }
trainer_state.json CHANGED
@@ -1,1663 +1,2800 @@
1
  {
2
- "best_metric": 0.04898680001497269,
3
- "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-2200",
4
- "epoch": 4.0,
5
  "eval_steps": 100,
6
- "global_step": 2384,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
- "learning_rate": 0.00019916107382550338,
14
- "loss": 0.5752,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.03,
19
- "learning_rate": 0.00019832214765100673,
20
- "loss": 0.3639,
21
  "step": 20
22
  },
23
  {
24
- "epoch": 0.05,
25
- "learning_rate": 0.00019748322147651007,
26
- "loss": 0.414,
27
  "step": 30
28
  },
29
  {
30
- "epoch": 0.07,
31
- "learning_rate": 0.00019664429530201342,
32
- "loss": 0.4994,
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.08,
37
- "learning_rate": 0.0001958053691275168,
38
- "loss": 0.3953,
39
  "step": 50
40
  },
41
  {
42
- "epoch": 0.1,
43
- "learning_rate": 0.00019496644295302013,
44
- "loss": 0.1514,
45
  "step": 60
46
  },
47
  {
48
- "epoch": 0.12,
49
- "learning_rate": 0.0001941275167785235,
50
- "loss": 0.4222,
51
  "step": 70
52
  },
53
  {
54
- "epoch": 0.13,
55
- "learning_rate": 0.00019328859060402688,
56
- "loss": 0.4522,
57
  "step": 80
58
  },
59
  {
60
- "epoch": 0.15,
61
- "learning_rate": 0.00019244966442953022,
62
- "loss": 0.39,
63
  "step": 90
64
  },
65
  {
66
- "epoch": 0.17,
67
- "learning_rate": 0.00019161073825503357,
68
- "loss": 0.1764,
69
  "step": 100
70
  },
71
  {
72
- "epoch": 0.17,
73
- "eval_accuracy": 0.9345088161209067,
74
- "eval_loss": 0.24242226779460907,
75
- "eval_runtime": 37.0601,
76
- "eval_samples_per_second": 64.274,
77
- "eval_steps_per_second": 8.041,
78
  "step": 100
79
  },
80
  {
81
- "epoch": 0.18,
82
- "learning_rate": 0.0001907718120805369,
83
- "loss": 0.199,
84
  "step": 110
85
  },
86
  {
87
- "epoch": 0.2,
88
- "learning_rate": 0.00018993288590604028,
89
- "loss": 0.2534,
90
  "step": 120
91
  },
92
  {
93
- "epoch": 0.22,
94
- "learning_rate": 0.00018909395973154363,
95
- "loss": 0.2201,
96
  "step": 130
97
  },
98
  {
99
- "epoch": 0.23,
100
- "learning_rate": 0.000188255033557047,
101
- "loss": 0.1656,
102
  "step": 140
103
  },
104
  {
105
- "epoch": 0.25,
106
- "learning_rate": 0.00018741610738255034,
107
- "loss": 0.3582,
108
  "step": 150
109
  },
110
  {
111
- "epoch": 0.27,
112
- "learning_rate": 0.0001865771812080537,
113
- "loss": 0.3158,
114
  "step": 160
115
  },
116
  {
117
- "epoch": 0.29,
118
- "learning_rate": 0.00018573825503355706,
119
- "loss": 0.2936,
120
  "step": 170
121
  },
122
  {
123
- "epoch": 0.3,
124
- "learning_rate": 0.0001848993288590604,
125
- "loss": 0.3009,
126
  "step": 180
127
  },
128
  {
129
- "epoch": 0.32,
130
- "learning_rate": 0.00018406040268456378,
131
- "loss": 0.3815,
132
  "step": 190
133
  },
134
  {
135
- "epoch": 0.34,
136
- "learning_rate": 0.00018322147651006712,
137
- "loss": 0.2401,
138
  "step": 200
139
  },
140
  {
141
- "epoch": 0.34,
142
- "eval_accuracy": 0.9160369437447523,
143
- "eval_loss": 0.29085931181907654,
144
- "eval_runtime": 37.2208,
145
- "eval_samples_per_second": 63.996,
146
- "eval_steps_per_second": 8.006,
147
  "step": 200
148
  },
149
  {
150
- "epoch": 0.35,
151
- "learning_rate": 0.00018238255033557047,
152
- "loss": 0.3746,
153
  "step": 210
154
  },
155
  {
156
- "epoch": 0.37,
157
- "learning_rate": 0.00018154362416107384,
158
- "loss": 0.3423,
159
  "step": 220
160
  },
161
  {
162
- "epoch": 0.39,
163
- "learning_rate": 0.00018070469798657718,
164
- "loss": 0.1224,
165
  "step": 230
166
  },
167
  {
168
- "epoch": 0.4,
169
- "learning_rate": 0.00017986577181208056,
170
- "loss": 0.2699,
171
  "step": 240
172
  },
173
  {
174
- "epoch": 0.42,
175
- "learning_rate": 0.0001790268456375839,
176
- "loss": 0.2199,
177
  "step": 250
178
  },
179
  {
180
- "epoch": 0.44,
181
- "learning_rate": 0.00017818791946308727,
182
- "loss": 0.4065,
183
  "step": 260
184
  },
185
  {
186
- "epoch": 0.45,
187
- "learning_rate": 0.00017734899328859062,
188
- "loss": 0.4393,
189
  "step": 270
190
  },
191
  {
192
- "epoch": 0.47,
193
- "learning_rate": 0.00017651006711409396,
194
- "loss": 0.3829,
195
  "step": 280
196
  },
197
  {
198
- "epoch": 0.49,
199
- "learning_rate": 0.00017567114093959733,
200
- "loss": 0.2467,
201
  "step": 290
202
  },
203
  {
204
- "epoch": 0.5,
205
- "learning_rate": 0.00017483221476510068,
206
- "loss": 0.1986,
207
  "step": 300
208
  },
209
  {
210
- "epoch": 0.5,
211
- "eval_accuracy": 0.9357682619647355,
212
- "eval_loss": 0.22584371268749237,
213
- "eval_runtime": 37.6779,
214
- "eval_samples_per_second": 63.22,
215
- "eval_steps_per_second": 7.909,
216
  "step": 300
217
  },
218
  {
219
- "epoch": 0.52,
220
- "learning_rate": 0.00017399328859060405,
221
- "loss": 0.3105,
222
  "step": 310
223
  },
224
  {
225
- "epoch": 0.54,
226
- "learning_rate": 0.0001731543624161074,
227
- "loss": 0.1708,
228
  "step": 320
229
  },
230
  {
231
- "epoch": 0.55,
232
- "learning_rate": 0.00017231543624161074,
233
- "loss": 0.2205,
234
  "step": 330
235
  },
236
  {
237
- "epoch": 0.57,
238
- "learning_rate": 0.00017147651006711408,
239
- "loss": 0.1301,
240
  "step": 340
241
  },
242
  {
243
- "epoch": 0.59,
244
- "learning_rate": 0.00017063758389261746,
245
- "loss": 0.2069,
246
  "step": 350
247
  },
248
  {
249
- "epoch": 0.6,
250
- "learning_rate": 0.00016979865771812083,
251
- "loss": 0.098,
252
  "step": 360
253
  },
254
  {
255
- "epoch": 0.62,
256
- "learning_rate": 0.00016895973154362417,
257
- "loss": 0.2552,
258
  "step": 370
259
  },
260
  {
261
- "epoch": 0.64,
262
- "learning_rate": 0.00016812080536912754,
263
- "loss": 0.2953,
264
  "step": 380
265
  },
266
  {
267
- "epoch": 0.65,
268
- "learning_rate": 0.00016728187919463086,
269
- "loss": 0.2195,
270
  "step": 390
271
  },
272
  {
273
- "epoch": 0.67,
274
- "learning_rate": 0.00016644295302013423,
275
- "loss": 0.1579,
276
  "step": 400
277
  },
278
  {
279
- "epoch": 0.67,
280
- "eval_accuracy": 0.9240134340890008,
281
- "eval_loss": 0.2699319124221802,
282
- "eval_runtime": 37.7055,
283
- "eval_samples_per_second": 63.174,
284
- "eval_steps_per_second": 7.903,
285
  "step": 400
286
  },
287
  {
288
- "epoch": 0.69,
289
- "learning_rate": 0.00016560402684563758,
290
- "loss": 0.3875,
291
  "step": 410
292
  },
293
  {
294
- "epoch": 0.7,
295
- "learning_rate": 0.00016476510067114095,
296
- "loss": 0.254,
297
  "step": 420
298
  },
299
  {
300
- "epoch": 0.72,
301
- "learning_rate": 0.00016401006711409396,
302
- "loss": 0.2794,
303
  "step": 430
304
  },
305
  {
306
- "epoch": 0.74,
307
- "learning_rate": 0.00016317114093959733,
308
- "loss": 0.1986,
309
  "step": 440
310
  },
311
  {
312
- "epoch": 0.76,
313
- "learning_rate": 0.00016233221476510067,
314
- "loss": 0.1392,
315
  "step": 450
316
  },
317
  {
318
- "epoch": 0.77,
319
- "learning_rate": 0.00016149328859060404,
320
- "loss": 0.1405,
321
  "step": 460
322
  },
323
  {
324
- "epoch": 0.79,
325
- "learning_rate": 0.0001606543624161074,
326
- "loss": 0.0757,
327
  "step": 470
328
  },
329
  {
330
- "epoch": 0.81,
331
- "learning_rate": 0.00015981543624161076,
332
- "loss": 0.1505,
333
  "step": 480
334
  },
335
  {
336
- "epoch": 0.82,
337
- "learning_rate": 0.0001589765100671141,
338
- "loss": 0.3554,
339
  "step": 490
340
  },
341
  {
342
- "epoch": 0.84,
343
- "learning_rate": 0.00015813758389261745,
344
- "loss": 0.21,
345
  "step": 500
346
  },
347
  {
348
- "epoch": 0.84,
349
- "eval_accuracy": 0.9634760705289672,
350
- "eval_loss": 0.1416286677122116,
351
- "eval_runtime": 37.8399,
352
- "eval_samples_per_second": 62.949,
353
- "eval_steps_per_second": 7.875,
354
  "step": 500
355
  },
356
  {
357
- "epoch": 0.86,
358
- "learning_rate": 0.00015729865771812082,
359
- "loss": 0.24,
360
  "step": 510
361
  },
362
  {
363
- "epoch": 0.87,
364
- "learning_rate": 0.00015645973154362417,
365
- "loss": 0.161,
366
  "step": 520
367
  },
368
  {
369
- "epoch": 0.89,
370
- "learning_rate": 0.00015562080536912754,
371
- "loss": 0.1271,
372
  "step": 530
373
  },
374
  {
375
- "epoch": 0.91,
376
- "learning_rate": 0.00015478187919463088,
377
- "loss": 0.2553,
378
  "step": 540
379
  },
380
  {
381
- "epoch": 0.92,
382
- "learning_rate": 0.00015394295302013423,
383
- "loss": 0.223,
384
  "step": 550
385
  },
386
  {
387
- "epoch": 0.94,
388
- "learning_rate": 0.00015310402684563757,
389
- "loss": 0.1555,
390
  "step": 560
391
  },
392
  {
393
- "epoch": 0.96,
394
- "learning_rate": 0.00015226510067114095,
395
- "loss": 0.2612,
396
  "step": 570
397
  },
398
  {
399
- "epoch": 0.97,
400
- "learning_rate": 0.00015142617449664432,
401
- "loss": 0.1819,
402
  "step": 580
403
  },
404
  {
405
- "epoch": 0.99,
406
- "learning_rate": 0.00015058724832214766,
407
- "loss": 0.2398,
408
  "step": 590
409
  },
410
  {
411
- "epoch": 1.01,
412
- "learning_rate": 0.00014974832214765103,
413
- "loss": 0.0873,
414
  "step": 600
415
  },
416
  {
417
- "epoch": 1.01,
418
- "eval_accuracy": 0.9466834592779177,
419
- "eval_loss": 0.1723610907793045,
420
- "eval_runtime": 36.2673,
421
- "eval_samples_per_second": 65.679,
422
- "eval_steps_per_second": 8.217,
423
  "step": 600
424
  },
425
  {
426
- "epoch": 1.02,
427
- "learning_rate": 0.00014890939597315435,
428
- "loss": 0.1876,
429
  "step": 610
430
  },
431
  {
432
- "epoch": 1.04,
433
- "learning_rate": 0.00014807046979865772,
434
- "loss": 0.0848,
435
  "step": 620
436
  },
437
  {
438
- "epoch": 1.06,
439
- "learning_rate": 0.00014723154362416107,
440
- "loss": 0.0784,
441
  "step": 630
442
  },
443
  {
444
- "epoch": 1.07,
445
- "learning_rate": 0.00014639261744966444,
446
- "loss": 0.0697,
447
  "step": 640
448
  },
449
  {
450
- "epoch": 1.09,
451
- "learning_rate": 0.0001455536912751678,
452
- "loss": 0.1411,
453
  "step": 650
454
  },
455
  {
456
- "epoch": 1.11,
457
- "learning_rate": 0.00014471476510067116,
458
- "loss": 0.0589,
459
  "step": 660
460
  },
461
  {
462
- "epoch": 1.12,
463
- "learning_rate": 0.0001438758389261745,
464
- "loss": 0.0867,
465
  "step": 670
466
  },
467
  {
468
- "epoch": 1.14,
469
- "learning_rate": 0.00014303691275167785,
470
- "loss": 0.0671,
471
  "step": 680
472
  },
473
  {
474
- "epoch": 1.16,
475
- "learning_rate": 0.00014219798657718122,
476
- "loss": 0.0821,
477
  "step": 690
478
  },
479
  {
480
- "epoch": 1.17,
481
- "learning_rate": 0.00014135906040268456,
482
- "loss": 0.1594,
483
  "step": 700
484
  },
485
  {
486
- "epoch": 1.17,
487
- "eval_accuracy": 0.9319899244332494,
488
- "eval_loss": 0.24392056465148926,
489
- "eval_runtime": 37.3417,
490
- "eval_samples_per_second": 63.789,
491
- "eval_steps_per_second": 7.98,
492
  "step": 700
493
  },
494
  {
495
- "epoch": 1.19,
496
- "learning_rate": 0.00014052013422818793,
497
- "loss": 0.1412,
498
  "step": 710
499
  },
500
  {
501
- "epoch": 1.21,
502
- "learning_rate": 0.00013968120805369128,
503
- "loss": 0.1345,
504
  "step": 720
505
  },
506
  {
507
- "epoch": 1.22,
508
- "learning_rate": 0.00013884228187919462,
509
- "loss": 0.0804,
510
  "step": 730
511
  },
512
  {
513
- "epoch": 1.24,
514
- "learning_rate": 0.000138003355704698,
515
- "loss": 0.0858,
516
  "step": 740
517
  },
518
  {
519
- "epoch": 1.26,
520
- "learning_rate": 0.00013716442953020134,
521
- "loss": 0.1313,
522
  "step": 750
523
  },
524
  {
525
- "epoch": 1.28,
526
- "learning_rate": 0.0001363255033557047,
527
- "loss": 0.0775,
528
  "step": 760
529
  },
530
  {
531
- "epoch": 1.29,
532
- "learning_rate": 0.00013548657718120806,
533
- "loss": 0.119,
534
  "step": 770
535
  },
536
  {
537
- "epoch": 1.31,
538
- "learning_rate": 0.00013464765100671143,
539
- "loss": 0.0896,
540
  "step": 780
541
  },
542
  {
543
- "epoch": 1.33,
544
- "learning_rate": 0.00013380872483221477,
545
- "loss": 0.0962,
546
  "step": 790
547
  },
548
  {
549
- "epoch": 1.34,
550
- "learning_rate": 0.00013296979865771812,
551
- "loss": 0.0824,
552
  "step": 800
553
  },
554
  {
555
- "epoch": 1.34,
556
- "eval_accuracy": 0.9701931150293871,
557
- "eval_loss": 0.10386788845062256,
558
- "eval_runtime": 37.9403,
559
- "eval_samples_per_second": 62.783,
560
- "eval_steps_per_second": 7.854,
561
  "step": 800
562
  },
563
  {
564
- "epoch": 1.36,
565
- "learning_rate": 0.0001321308724832215,
566
- "loss": 0.0185,
567
  "step": 810
568
  },
569
  {
570
- "epoch": 1.38,
571
- "learning_rate": 0.00013129194630872484,
572
- "loss": 0.0723,
573
  "step": 820
574
  },
575
  {
576
- "epoch": 1.39,
577
- "learning_rate": 0.0001304530201342282,
578
- "loss": 0.1575,
579
  "step": 830
580
  },
581
  {
582
- "epoch": 1.41,
583
- "learning_rate": 0.00012961409395973155,
584
- "loss": 0.1435,
585
  "step": 840
586
  },
587
  {
588
- "epoch": 1.43,
589
- "learning_rate": 0.0001287751677852349,
590
- "loss": 0.0303,
591
  "step": 850
592
  },
593
  {
594
- "epoch": 1.44,
595
- "learning_rate": 0.00012793624161073827,
596
- "loss": 0.1279,
597
  "step": 860
598
  },
599
  {
600
- "epoch": 1.46,
601
- "learning_rate": 0.00012709731543624161,
602
- "loss": 0.1075,
603
  "step": 870
604
  },
605
  {
606
- "epoch": 1.48,
607
- "learning_rate": 0.00012625838926174499,
608
- "loss": 0.0728,
609
  "step": 880
610
  },
611
  {
612
- "epoch": 1.49,
613
- "learning_rate": 0.00012541946308724833,
614
- "loss": 0.1591,
615
  "step": 890
616
  },
617
  {
618
- "epoch": 1.51,
619
- "learning_rate": 0.0001245805369127517,
620
- "loss": 0.0888,
621
  "step": 900
622
  },
623
  {
624
- "epoch": 1.51,
625
- "eval_accuracy": 0.9546599496221663,
626
- "eval_loss": 0.14956098794937134,
627
- "eval_runtime": 37.2049,
628
- "eval_samples_per_second": 64.024,
629
- "eval_steps_per_second": 8.01,
630
  "step": 900
631
  },
632
  {
633
- "epoch": 1.53,
634
- "learning_rate": 0.00012374161073825502,
635
- "loss": 0.0903,
636
  "step": 910
637
  },
638
  {
639
- "epoch": 1.54,
640
- "learning_rate": 0.0001229026845637584,
641
- "loss": 0.0751,
642
  "step": 920
643
  },
644
  {
645
- "epoch": 1.56,
646
- "learning_rate": 0.00012206375838926174,
647
- "loss": 0.1672,
648
  "step": 930
649
  },
650
  {
651
- "epoch": 1.58,
652
- "learning_rate": 0.00012122483221476511,
653
- "loss": 0.0507,
654
  "step": 940
655
  },
656
  {
657
- "epoch": 1.59,
658
- "learning_rate": 0.00012038590604026847,
659
- "loss": 0.1459,
660
  "step": 950
661
  },
662
  {
663
- "epoch": 1.61,
664
- "learning_rate": 0.00011954697986577181,
665
- "loss": 0.0659,
666
  "step": 960
667
  },
668
  {
669
- "epoch": 1.63,
670
- "learning_rate": 0.00011870805369127518,
671
- "loss": 0.0726,
672
  "step": 970
673
  },
674
  {
675
- "epoch": 1.64,
676
- "learning_rate": 0.00011786912751677853,
677
- "loss": 0.0898,
678
  "step": 980
679
  },
680
  {
681
- "epoch": 1.66,
682
- "learning_rate": 0.00011703020134228189,
683
- "loss": 0.1465,
684
  "step": 990
685
  },
686
  {
687
- "epoch": 1.68,
688
- "learning_rate": 0.00011619127516778523,
689
- "loss": 0.1058,
690
  "step": 1000
691
  },
692
  {
693
- "epoch": 1.68,
694
- "eval_accuracy": 0.9748110831234257,
695
- "eval_loss": 0.1099860668182373,
696
- "eval_runtime": 37.5608,
697
- "eval_samples_per_second": 63.417,
698
- "eval_steps_per_second": 7.934,
699
  "step": 1000
700
  },
701
  {
702
- "epoch": 1.69,
703
- "learning_rate": 0.00011535234899328859,
704
- "loss": 0.1159,
705
  "step": 1010
706
  },
707
  {
708
- "epoch": 1.71,
709
- "learning_rate": 0.00011451342281879196,
710
- "loss": 0.0583,
711
  "step": 1020
712
  },
713
  {
714
- "epoch": 1.73,
715
- "learning_rate": 0.0001136744966442953,
716
- "loss": 0.1075,
717
  "step": 1030
718
  },
719
  {
720
- "epoch": 1.74,
721
- "learning_rate": 0.00011283557046979866,
722
- "loss": 0.1809,
723
  "step": 1040
724
  },
725
  {
726
- "epoch": 1.76,
727
- "learning_rate": 0.00011199664429530201,
728
- "loss": 0.1112,
729
  "step": 1050
730
  },
731
  {
732
- "epoch": 1.78,
733
- "learning_rate": 0.00011115771812080538,
734
- "loss": 0.0408,
735
  "step": 1060
736
  },
737
  {
738
- "epoch": 1.8,
739
- "learning_rate": 0.00011031879194630873,
740
- "loss": 0.1428,
741
  "step": 1070
742
  },
743
  {
744
- "epoch": 1.81,
745
- "learning_rate": 0.00010947986577181208,
746
- "loss": 0.0248,
747
  "step": 1080
748
  },
749
  {
750
- "epoch": 1.83,
751
- "learning_rate": 0.00010864093959731546,
752
- "loss": 0.0873,
753
  "step": 1090
754
  },
755
  {
756
- "epoch": 1.85,
757
- "learning_rate": 0.00010780201342281879,
758
- "loss": 0.077,
759
  "step": 1100
760
  },
761
  {
762
- "epoch": 1.85,
763
- "eval_accuracy": 0.9756507136859782,
764
- "eval_loss": 0.07842576503753662,
765
- "eval_runtime": 36.9225,
766
- "eval_samples_per_second": 64.513,
767
- "eval_steps_per_second": 8.071,
768
  "step": 1100
769
  },
770
  {
771
- "epoch": 1.86,
772
- "learning_rate": 0.00010696308724832216,
773
- "loss": 0.1448,
774
  "step": 1110
775
  },
776
  {
777
- "epoch": 1.88,
778
- "learning_rate": 0.0001061241610738255,
779
- "loss": 0.1401,
780
  "step": 1120
781
  },
782
  {
783
- "epoch": 1.9,
784
- "learning_rate": 0.00010528523489932886,
785
- "loss": 0.0186,
786
  "step": 1130
787
  },
788
  {
789
- "epoch": 1.91,
790
- "learning_rate": 0.00010444630872483221,
791
- "loss": 0.0693,
792
  "step": 1140
793
  },
794
  {
795
- "epoch": 1.93,
796
- "learning_rate": 0.00010360738255033558,
797
- "loss": 0.0521,
798
  "step": 1150
799
  },
800
  {
801
- "epoch": 1.95,
802
- "learning_rate": 0.00010276845637583894,
803
- "loss": 0.0286,
804
  "step": 1160
805
  },
806
  {
807
- "epoch": 1.96,
808
- "learning_rate": 0.00010192953020134228,
809
- "loss": 0.0321,
810
  "step": 1170
811
  },
812
  {
813
- "epoch": 1.98,
814
- "learning_rate": 0.00010109060402684565,
815
- "loss": 0.1065,
816
  "step": 1180
817
  },
818
  {
819
- "epoch": 2.0,
820
- "learning_rate": 0.00010025167785234899,
821
- "loss": 0.071,
822
  "step": 1190
823
  },
824
  {
825
- "epoch": 2.01,
826
- "learning_rate": 9.941275167785236e-05,
827
- "loss": 0.0486,
828
  "step": 1200
829
  },
830
  {
831
- "epoch": 2.01,
832
- "eval_accuracy": 0.9710327455919395,
833
- "eval_loss": 0.09972918033599854,
834
- "eval_runtime": 38.6678,
835
- "eval_samples_per_second": 61.602,
836
- "eval_steps_per_second": 7.707,
837
  "step": 1200
838
  },
839
  {
840
- "epoch": 2.03,
841
- "learning_rate": 9.857382550335572e-05,
842
- "loss": 0.04,
843
  "step": 1210
844
  },
845
  {
846
- "epoch": 2.05,
847
- "learning_rate": 9.773489932885906e-05,
848
- "loss": 0.0241,
849
  "step": 1220
850
  },
851
  {
852
- "epoch": 2.06,
853
- "learning_rate": 9.689597315436242e-05,
854
- "loss": 0.0574,
855
  "step": 1230
856
  },
857
  {
858
- "epoch": 2.08,
859
- "learning_rate": 9.605704697986578e-05,
860
- "loss": 0.0196,
861
  "step": 1240
862
  },
863
  {
864
- "epoch": 2.1,
865
- "learning_rate": 9.521812080536914e-05,
866
- "loss": 0.0344,
867
  "step": 1250
868
  },
869
  {
870
- "epoch": 2.11,
871
- "learning_rate": 9.43791946308725e-05,
872
- "loss": 0.0342,
873
  "step": 1260
874
  },
875
  {
876
- "epoch": 2.13,
877
- "learning_rate": 9.354026845637585e-05,
878
- "loss": 0.0121,
879
  "step": 1270
880
  },
881
  {
882
- "epoch": 2.15,
883
- "learning_rate": 9.27013422818792e-05,
884
- "loss": 0.0251,
885
  "step": 1280
886
  },
887
  {
888
- "epoch": 2.16,
889
- "learning_rate": 9.186241610738255e-05,
890
- "loss": 0.0503,
891
  "step": 1290
892
  },
893
  {
894
- "epoch": 2.18,
895
- "learning_rate": 9.102348993288591e-05,
896
- "loss": 0.0087,
897
  "step": 1300
898
  },
899
  {
900
- "epoch": 2.18,
901
- "eval_accuracy": 0.9811083123425692,
902
- "eval_loss": 0.07042522728443146,
903
- "eval_runtime": 37.2902,
904
- "eval_samples_per_second": 63.877,
905
- "eval_steps_per_second": 7.991,
906
  "step": 1300
907
  },
908
  {
909
- "epoch": 2.2,
910
- "learning_rate": 9.018456375838926e-05,
911
- "loss": 0.0676,
912
  "step": 1310
913
  },
914
  {
915
- "epoch": 2.21,
916
- "learning_rate": 8.934563758389262e-05,
917
- "loss": 0.0445,
918
  "step": 1320
919
  },
920
  {
921
- "epoch": 2.23,
922
- "learning_rate": 8.850671140939599e-05,
923
- "loss": 0.0157,
924
  "step": 1330
925
  },
926
  {
927
- "epoch": 2.25,
928
- "learning_rate": 8.766778523489933e-05,
929
- "loss": 0.045,
930
  "step": 1340
931
  },
932
  {
933
- "epoch": 2.27,
934
- "learning_rate": 8.682885906040269e-05,
935
- "loss": 0.0071,
936
  "step": 1350
937
  },
938
  {
939
- "epoch": 2.28,
940
- "learning_rate": 8.598993288590605e-05,
941
- "loss": 0.0219,
942
  "step": 1360
943
  },
944
  {
945
- "epoch": 2.3,
946
- "learning_rate": 8.51510067114094e-05,
947
- "loss": 0.0201,
948
  "step": 1370
949
  },
950
  {
951
- "epoch": 2.32,
952
- "learning_rate": 8.431208053691275e-05,
953
- "loss": 0.0097,
954
  "step": 1380
955
  },
956
  {
957
- "epoch": 2.33,
958
- "learning_rate": 8.347315436241611e-05,
959
- "loss": 0.0552,
960
  "step": 1390
961
  },
962
  {
963
- "epoch": 2.35,
964
- "learning_rate": 8.263422818791947e-05,
965
- "loss": 0.0382,
966
  "step": 1400
967
  },
968
  {
969
- "epoch": 2.35,
970
- "eval_accuracy": 0.9760705289672544,
971
- "eval_loss": 0.09097278863191605,
972
- "eval_runtime": 38.1785,
973
- "eval_samples_per_second": 62.391,
974
- "eval_steps_per_second": 7.805,
975
  "step": 1400
976
  },
977
  {
978
- "epoch": 2.37,
979
- "learning_rate": 8.179530201342283e-05,
980
- "loss": 0.0136,
981
  "step": 1410
982
  },
983
  {
984
- "epoch": 2.38,
985
- "learning_rate": 8.095637583892619e-05,
986
- "loss": 0.0735,
987
  "step": 1420
988
  },
989
  {
990
- "epoch": 2.4,
991
- "learning_rate": 8.011744966442953e-05,
992
- "loss": 0.0244,
993
  "step": 1430
994
  },
995
  {
996
- "epoch": 2.42,
997
- "learning_rate": 7.927852348993289e-05,
998
- "loss": 0.0208,
999
  "step": 1440
1000
  },
1001
  {
1002
- "epoch": 2.43,
1003
- "learning_rate": 7.843959731543625e-05,
1004
- "loss": 0.0289,
1005
  "step": 1450
1006
  },
1007
  {
1008
- "epoch": 2.45,
1009
- "learning_rate": 7.760067114093959e-05,
1010
- "loss": 0.023,
1011
  "step": 1460
1012
  },
1013
  {
1014
- "epoch": 2.47,
1015
- "learning_rate": 7.676174496644296e-05,
1016
- "loss": 0.006,
1017
  "step": 1470
1018
  },
1019
  {
1020
- "epoch": 2.48,
1021
- "learning_rate": 7.592281879194631e-05,
1022
- "loss": 0.0415,
1023
  "step": 1480
1024
  },
1025
  {
1026
- "epoch": 2.5,
1027
- "learning_rate": 7.508389261744967e-05,
1028
- "loss": 0.0253,
1029
  "step": 1490
1030
  },
1031
  {
1032
- "epoch": 2.52,
1033
- "learning_rate": 7.424496644295303e-05,
1034
- "loss": 0.046,
1035
  "step": 1500
1036
  },
1037
  {
1038
- "epoch": 2.52,
1039
- "eval_accuracy": 0.984047019311503,
1040
- "eval_loss": 0.05878419801592827,
1041
- "eval_runtime": 37.7397,
1042
- "eval_samples_per_second": 63.116,
1043
- "eval_steps_per_second": 7.896,
1044
  "step": 1500
1045
  },
1046
  {
1047
- "epoch": 2.53,
1048
- "learning_rate": 7.340604026845638e-05,
1049
- "loss": 0.0409,
1050
  "step": 1510
1051
  },
1052
  {
1053
- "epoch": 2.55,
1054
- "learning_rate": 7.256711409395973e-05,
1055
- "loss": 0.0335,
1056
  "step": 1520
1057
  },
1058
  {
1059
- "epoch": 2.57,
1060
- "learning_rate": 7.172818791946309e-05,
1061
- "loss": 0.0049,
1062
  "step": 1530
1063
  },
1064
  {
1065
- "epoch": 2.58,
1066
- "learning_rate": 7.088926174496645e-05,
1067
- "loss": 0.0119,
1068
  "step": 1540
1069
  },
1070
  {
1071
- "epoch": 2.6,
1072
- "learning_rate": 7.00503355704698e-05,
1073
- "loss": 0.0188,
1074
  "step": 1550
1075
  },
1076
  {
1077
- "epoch": 2.62,
1078
- "learning_rate": 6.921140939597316e-05,
1079
- "loss": 0.0047,
1080
  "step": 1560
1081
  },
1082
  {
1083
- "epoch": 2.63,
1084
- "learning_rate": 6.83724832214765e-05,
1085
- "loss": 0.0053,
1086
  "step": 1570
1087
  },
1088
  {
1089
- "epoch": 2.65,
1090
- "learning_rate": 6.753355704697986e-05,
1091
- "loss": 0.0034,
1092
  "step": 1580
1093
  },
1094
  {
1095
- "epoch": 2.67,
1096
- "learning_rate": 6.669463087248322e-05,
1097
- "loss": 0.0026,
1098
  "step": 1590
1099
  },
1100
  {
1101
- "epoch": 2.68,
1102
- "learning_rate": 6.585570469798658e-05,
1103
- "loss": 0.0163,
1104
  "step": 1600
1105
  },
1106
  {
1107
- "epoch": 2.68,
1108
- "eval_accuracy": 0.9836272040302267,
1109
- "eval_loss": 0.06088751554489136,
1110
- "eval_runtime": 38.2646,
1111
- "eval_samples_per_second": 62.251,
1112
- "eval_steps_per_second": 7.788,
1113
  "step": 1600
1114
  },
1115
  {
1116
- "epoch": 2.7,
1117
- "learning_rate": 6.501677852348994e-05,
1118
- "loss": 0.0029,
1119
  "step": 1610
1120
  },
1121
  {
1122
- "epoch": 2.72,
1123
- "learning_rate": 6.41778523489933e-05,
1124
- "loss": 0.0027,
1125
  "step": 1620
1126
  },
1127
  {
1128
- "epoch": 2.73,
1129
- "learning_rate": 6.333892617449664e-05,
1130
- "loss": 0.0394,
1131
  "step": 1630
1132
  },
1133
  {
1134
- "epoch": 2.75,
1135
- "learning_rate": 6.25e-05,
1136
- "loss": 0.0235,
1137
  "step": 1640
1138
  },
1139
  {
1140
- "epoch": 2.77,
1141
- "learning_rate": 6.166107382550336e-05,
1142
- "loss": 0.0199,
1143
  "step": 1650
1144
  },
1145
  {
1146
- "epoch": 2.79,
1147
- "learning_rate": 6.082214765100671e-05,
1148
- "loss": 0.0037,
1149
  "step": 1660
1150
  },
1151
  {
1152
- "epoch": 2.8,
1153
- "learning_rate": 5.998322147651006e-05,
1154
- "loss": 0.0151,
1155
  "step": 1670
1156
  },
1157
  {
1158
- "epoch": 2.82,
1159
- "learning_rate": 5.914429530201343e-05,
1160
- "loss": 0.0472,
1161
  "step": 1680
1162
  },
1163
  {
1164
- "epoch": 2.84,
1165
- "learning_rate": 5.8305369127516786e-05,
1166
- "loss": 0.0406,
1167
  "step": 1690
1168
  },
1169
  {
1170
- "epoch": 2.85,
1171
- "learning_rate": 5.746644295302014e-05,
1172
- "loss": 0.0876,
1173
  "step": 1700
1174
  },
1175
  {
1176
- "epoch": 2.85,
1177
- "eval_accuracy": 0.9811083123425692,
1178
- "eval_loss": 0.06890641152858734,
1179
- "eval_runtime": 38.0085,
1180
- "eval_samples_per_second": 62.67,
1181
- "eval_steps_per_second": 7.84,
1182
  "step": 1700
1183
  },
1184
  {
1185
- "epoch": 2.87,
1186
- "learning_rate": 5.6627516778523496e-05,
1187
- "loss": 0.0033,
1188
  "step": 1710
1189
  },
1190
  {
1191
- "epoch": 2.89,
1192
- "learning_rate": 5.578859060402685e-05,
1193
- "loss": 0.003,
1194
  "step": 1720
1195
  },
1196
  {
1197
- "epoch": 2.9,
1198
- "learning_rate": 5.49496644295302e-05,
1199
- "loss": 0.0024,
1200
  "step": 1730
1201
  },
1202
  {
1203
- "epoch": 2.92,
1204
- "learning_rate": 5.411073825503356e-05,
1205
- "loss": 0.0122,
1206
  "step": 1740
1207
  },
1208
  {
1209
- "epoch": 2.94,
1210
- "learning_rate": 5.327181208053692e-05,
1211
- "loss": 0.0072,
1212
  "step": 1750
1213
  },
1214
  {
1215
- "epoch": 2.95,
1216
- "learning_rate": 5.2432885906040274e-05,
1217
- "loss": 0.0023,
1218
  "step": 1760
1219
  },
1220
  {
1221
- "epoch": 2.97,
1222
- "learning_rate": 5.1593959731543626e-05,
1223
- "loss": 0.0098,
1224
  "step": 1770
1225
  },
1226
  {
1227
- "epoch": 2.99,
1228
- "learning_rate": 5.0755033557046984e-05,
1229
- "loss": 0.035,
1230
  "step": 1780
1231
  },
1232
  {
1233
- "epoch": 3.0,
1234
- "learning_rate": 4.9916107382550336e-05,
1235
- "loss": 0.0349,
1236
  "step": 1790
1237
  },
1238
  {
1239
- "epoch": 3.02,
1240
- "learning_rate": 4.9077181208053694e-05,
1241
- "loss": 0.0072,
1242
  "step": 1800
1243
  },
1244
  {
1245
- "epoch": 3.02,
1246
- "eval_accuracy": 0.9861460957178841,
1247
- "eval_loss": 0.06315400451421738,
1248
- "eval_runtime": 38.6086,
1249
- "eval_samples_per_second": 61.696,
1250
- "eval_steps_per_second": 7.718,
1251
  "step": 1800
1252
  },
1253
  {
1254
- "epoch": 3.04,
1255
- "learning_rate": 4.823825503355705e-05,
1256
- "loss": 0.0022,
1257
  "step": 1810
1258
  },
1259
  {
1260
- "epoch": 3.05,
1261
- "learning_rate": 4.7399328859060404e-05,
1262
- "loss": 0.0131,
1263
  "step": 1820
1264
  },
1265
  {
1266
- "epoch": 3.07,
1267
- "learning_rate": 4.6560402684563755e-05,
1268
- "loss": 0.0031,
1269
  "step": 1830
1270
  },
1271
  {
1272
- "epoch": 3.09,
1273
- "learning_rate": 4.572147651006712e-05,
1274
- "loss": 0.0024,
1275
  "step": 1840
1276
  },
1277
  {
1278
- "epoch": 3.1,
1279
- "learning_rate": 4.488255033557047e-05,
1280
- "loss": 0.0022,
1281
  "step": 1850
1282
  },
1283
  {
1284
- "epoch": 3.12,
1285
- "learning_rate": 4.4043624161073823e-05,
1286
- "loss": 0.0029,
1287
  "step": 1860
1288
  },
1289
  {
1290
- "epoch": 3.14,
1291
- "learning_rate": 4.320469798657718e-05,
1292
- "loss": 0.0018,
1293
  "step": 1870
1294
  },
1295
  {
1296
- "epoch": 3.15,
1297
- "learning_rate": 4.236577181208054e-05,
1298
- "loss": 0.0019,
1299
  "step": 1880
1300
  },
1301
  {
1302
- "epoch": 3.17,
1303
- "learning_rate": 4.152684563758389e-05,
1304
- "loss": 0.0046,
1305
  "step": 1890
1306
  },
1307
  {
1308
- "epoch": 3.19,
1309
- "learning_rate": 4.068791946308725e-05,
1310
- "loss": 0.0022,
1311
  "step": 1900
1312
  },
1313
  {
1314
- "epoch": 3.19,
1315
- "eval_accuracy": 0.9844668345927792,
1316
- "eval_loss": 0.0669856071472168,
1317
- "eval_runtime": 38.9734,
1318
- "eval_samples_per_second": 61.119,
1319
- "eval_steps_per_second": 7.646,
1320
  "step": 1900
1321
  },
1322
  {
1323
- "epoch": 3.2,
1324
- "learning_rate": 3.984899328859061e-05,
1325
- "loss": 0.0031,
1326
  "step": 1910
1327
  },
1328
  {
1329
- "epoch": 3.22,
1330
- "learning_rate": 3.901006711409396e-05,
1331
- "loss": 0.0033,
1332
  "step": 1920
1333
  },
1334
  {
1335
- "epoch": 3.24,
1336
- "learning_rate": 3.817114093959732e-05,
1337
- "loss": 0.0171,
1338
  "step": 1930
1339
  },
1340
  {
1341
- "epoch": 3.26,
1342
- "learning_rate": 3.733221476510067e-05,
1343
- "loss": 0.0022,
1344
  "step": 1940
1345
  },
1346
  {
1347
- "epoch": 3.27,
1348
- "learning_rate": 3.649328859060403e-05,
1349
- "loss": 0.0023,
1350
  "step": 1950
1351
  },
1352
  {
1353
- "epoch": 3.29,
1354
- "learning_rate": 3.5654362416107386e-05,
1355
- "loss": 0.0036,
1356
  "step": 1960
1357
  },
1358
  {
1359
- "epoch": 3.31,
1360
- "learning_rate": 3.481543624161074e-05,
1361
- "loss": 0.0038,
1362
  "step": 1970
1363
  },
1364
  {
1365
- "epoch": 3.32,
1366
- "learning_rate": 3.3976510067114096e-05,
1367
- "loss": 0.0016,
1368
  "step": 1980
1369
  },
1370
  {
1371
- "epoch": 3.34,
1372
- "learning_rate": 3.3137583892617455e-05,
1373
- "loss": 0.0022,
1374
  "step": 1990
1375
  },
1376
  {
1377
- "epoch": 3.36,
1378
- "learning_rate": 3.2298657718120806e-05,
1379
- "loss": 0.0018,
1380
  "step": 2000
1381
  },
1382
  {
1383
- "epoch": 3.36,
1384
- "eval_accuracy": 0.9844668345927792,
1385
- "eval_loss": 0.0704568475484848,
1386
- "eval_runtime": 36.8214,
1387
- "eval_samples_per_second": 64.691,
1388
- "eval_steps_per_second": 8.093,
1389
  "step": 2000
1390
  },
1391
  {
1392
- "epoch": 3.37,
1393
- "learning_rate": 3.145973154362416e-05,
1394
- "loss": 0.0036,
1395
  "step": 2010
1396
  },
1397
  {
1398
- "epoch": 3.39,
1399
- "learning_rate": 3.062080536912752e-05,
1400
- "loss": 0.0018,
1401
  "step": 2020
1402
  },
1403
  {
1404
- "epoch": 3.41,
1405
- "learning_rate": 2.9781879194630874e-05,
1406
- "loss": 0.0015,
1407
  "step": 2030
1408
  },
1409
  {
1410
- "epoch": 3.42,
1411
- "learning_rate": 2.894295302013423e-05,
1412
- "loss": 0.0018,
1413
  "step": 2040
1414
  },
1415
  {
1416
- "epoch": 3.44,
1417
- "learning_rate": 2.8104026845637588e-05,
1418
- "loss": 0.0021,
1419
  "step": 2050
1420
  },
1421
  {
1422
- "epoch": 3.46,
1423
- "learning_rate": 2.7265100671140943e-05,
1424
- "loss": 0.0016,
1425
  "step": 2060
1426
  },
1427
  {
1428
- "epoch": 3.47,
1429
- "learning_rate": 2.6426174496644297e-05,
1430
- "loss": 0.0019,
1431
  "step": 2070
1432
  },
1433
  {
1434
- "epoch": 3.49,
1435
- "learning_rate": 2.558724832214765e-05,
1436
- "loss": 0.0025,
1437
  "step": 2080
1438
  },
1439
  {
1440
- "epoch": 3.51,
1441
- "learning_rate": 2.4748322147651007e-05,
1442
- "loss": 0.0071,
1443
  "step": 2090
1444
  },
1445
  {
1446
- "epoch": 3.52,
1447
- "learning_rate": 2.3909395973154362e-05,
1448
- "loss": 0.0023,
1449
  "step": 2100
1450
  },
1451
  {
1452
- "epoch": 3.52,
1453
- "eval_accuracy": 0.9861460957178841,
1454
- "eval_loss": 0.0588483065366745,
1455
- "eval_runtime": 36.7062,
1456
- "eval_samples_per_second": 64.894,
1457
- "eval_steps_per_second": 8.119,
1458
  "step": 2100
1459
  },
1460
  {
1461
- "epoch": 3.54,
1462
- "learning_rate": 2.3070469798657717e-05,
1463
- "loss": 0.0111,
1464
  "step": 2110
1465
  },
1466
  {
1467
- "epoch": 3.56,
1468
- "learning_rate": 2.2231543624161076e-05,
1469
- "loss": 0.0019,
1470
  "step": 2120
1471
  },
1472
  {
1473
- "epoch": 3.57,
1474
- "learning_rate": 2.139261744966443e-05,
1475
- "loss": 0.007,
1476
  "step": 2130
1477
  },
1478
  {
1479
- "epoch": 3.59,
1480
- "learning_rate": 2.0553691275167785e-05,
1481
- "loss": 0.0022,
1482
  "step": 2140
1483
  },
1484
  {
1485
- "epoch": 3.61,
1486
- "learning_rate": 1.9714765100671144e-05,
1487
- "loss": 0.0019,
1488
  "step": 2150
1489
  },
1490
  {
1491
- "epoch": 3.62,
1492
- "learning_rate": 1.8875838926174495e-05,
1493
- "loss": 0.0017,
1494
  "step": 2160
1495
  },
1496
  {
1497
- "epoch": 3.64,
1498
- "learning_rate": 1.8036912751677854e-05,
1499
- "loss": 0.0017,
1500
  "step": 2170
1501
  },
1502
  {
1503
- "epoch": 3.66,
1504
- "learning_rate": 1.719798657718121e-05,
1505
- "loss": 0.0134,
1506
  "step": 2180
1507
  },
1508
  {
1509
- "epoch": 3.67,
1510
- "learning_rate": 1.6359060402684563e-05,
1511
- "loss": 0.0017,
1512
  "step": 2190
1513
  },
1514
  {
1515
- "epoch": 3.69,
1516
- "learning_rate": 1.5520134228187922e-05,
1517
- "loss": 0.0017,
1518
  "step": 2200
1519
  },
1520
  {
1521
- "epoch": 3.69,
1522
- "eval_accuracy": 0.9882451721242653,
1523
- "eval_loss": 0.04898680001497269,
1524
- "eval_runtime": 36.2766,
1525
- "eval_samples_per_second": 65.662,
1526
- "eval_steps_per_second": 8.215,
1527
  "step": 2200
1528
  },
1529
  {
1530
- "epoch": 3.71,
1531
- "learning_rate": 1.4681208053691275e-05,
1532
- "loss": 0.002,
1533
  "step": 2210
1534
  },
1535
  {
1536
- "epoch": 3.72,
1537
- "learning_rate": 1.3842281879194632e-05,
1538
- "loss": 0.0019,
1539
  "step": 2220
1540
  },
1541
  {
1542
- "epoch": 3.74,
1543
- "learning_rate": 1.3003355704697987e-05,
1544
- "loss": 0.0017,
1545
  "step": 2230
1546
  },
1547
  {
1548
- "epoch": 3.76,
1549
- "learning_rate": 1.2164429530201343e-05,
1550
- "loss": 0.0021,
1551
  "step": 2240
1552
  },
1553
  {
1554
- "epoch": 3.78,
1555
- "learning_rate": 1.1325503355704698e-05,
1556
- "loss": 0.0027,
1557
  "step": 2250
1558
  },
1559
  {
1560
- "epoch": 3.79,
1561
- "learning_rate": 1.0486577181208055e-05,
1562
- "loss": 0.0027,
1563
  "step": 2260
1564
  },
1565
  {
1566
- "epoch": 3.81,
1567
- "learning_rate": 9.64765100671141e-06,
1568
- "loss": 0.0021,
1569
  "step": 2270
1570
  },
1571
  {
1572
- "epoch": 3.83,
1573
- "learning_rate": 8.808724832214765e-06,
1574
- "loss": 0.0017,
1575
  "step": 2280
1576
  },
1577
  {
1578
- "epoch": 3.84,
1579
- "learning_rate": 7.969798657718121e-06,
1580
- "loss": 0.0018,
1581
  "step": 2290
1582
  },
1583
  {
1584
- "epoch": 3.86,
1585
- "learning_rate": 7.130872483221476e-06,
1586
- "loss": 0.0248,
1587
  "step": 2300
1588
  },
1589
  {
1590
- "epoch": 3.86,
1591
- "eval_accuracy": 0.9882451721242653,
1592
- "eval_loss": 0.04982466623187065,
1593
- "eval_runtime": 37.2392,
1594
- "eval_samples_per_second": 63.965,
1595
- "eval_steps_per_second": 8.002,
1596
  "step": 2300
1597
  },
1598
  {
1599
- "epoch": 3.88,
1600
- "learning_rate": 6.291946308724833e-06,
1601
- "loss": 0.0024,
1602
  "step": 2310
1603
  },
1604
  {
1605
- "epoch": 3.89,
1606
- "learning_rate": 5.453020134228188e-06,
1607
- "loss": 0.0017,
1608
  "step": 2320
1609
  },
1610
  {
1611
- "epoch": 3.91,
1612
- "learning_rate": 4.6140939597315445e-06,
1613
- "loss": 0.0315,
1614
  "step": 2330
1615
  },
1616
  {
1617
- "epoch": 3.93,
1618
- "learning_rate": 3.7751677852349e-06,
1619
- "loss": 0.0018,
1620
  "step": 2340
1621
  },
1622
  {
1623
- "epoch": 3.94,
1624
- "learning_rate": 2.936241610738255e-06,
1625
- "loss": 0.0018,
1626
  "step": 2350
1627
  },
1628
  {
1629
- "epoch": 3.96,
1630
- "learning_rate": 2.097315436241611e-06,
1631
- "loss": 0.0016,
1632
  "step": 2360
1633
  },
1634
  {
1635
- "epoch": 3.98,
1636
- "learning_rate": 1.2583892617449663e-06,
1637
- "loss": 0.0027,
1638
  "step": 2370
1639
  },
1640
  {
1641
- "epoch": 3.99,
1642
- "learning_rate": 4.1946308724832216e-07,
1643
- "loss": 0.0133,
1644
  "step": 2380
1645
  },
1646
  {
1647
- "epoch": 4.0,
1648
- "step": 2384,
1649
- "total_flos": 2.953718820314874e+18,
1650
- "train_loss": 0.09724263496534437,
1651
- "train_runtime": 2056.7864,
1652
- "train_samples_per_second": 18.53,
1653
- "train_steps_per_second": 1.159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1654
  }
1655
  ],
1656
  "logging_steps": 10,
1657
- "max_steps": 2384,
1658
- "num_train_epochs": 4,
1659
  "save_steps": 100,
1660
- "total_flos": 2.953718820314874e+18,
1661
  "trial_name": null,
1662
  "trial_params": null
1663
  }
 
1
  {
2
+ "best_metric": 0.03492419794201851,
3
+ "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-2800",
4
+ "epoch": 6.0,
5
  "eval_steps": 100,
6
+ "global_step": 4020,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
+ "learning_rate": 9.975124378109453e-05,
14
+ "loss": 2.5776,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.03,
19
+ "learning_rate": 9.950248756218906e-05,
20
+ "loss": 2.217,
21
  "step": 20
22
  },
23
  {
24
+ "epoch": 0.04,
25
+ "learning_rate": 9.925373134328359e-05,
26
+ "loss": 1.9279,
27
  "step": 30
28
  },
29
  {
30
+ "epoch": 0.06,
31
+ "learning_rate": 9.900497512437812e-05,
32
+ "loss": 1.6945,
33
  "step": 40
34
  },
35
  {
36
+ "epoch": 0.07,
37
+ "learning_rate": 9.875621890547265e-05,
38
+ "loss": 1.4931,
39
  "step": 50
40
  },
41
  {
42
+ "epoch": 0.09,
43
+ "learning_rate": 9.850746268656717e-05,
44
+ "loss": 1.2644,
45
  "step": 60
46
  },
47
  {
48
+ "epoch": 0.1,
49
+ "learning_rate": 9.82587064676617e-05,
50
+ "loss": 1.1095,
51
  "step": 70
52
  },
53
  {
54
+ "epoch": 0.12,
55
+ "learning_rate": 9.800995024875622e-05,
56
+ "loss": 1.0658,
57
  "step": 80
58
  },
59
  {
60
+ "epoch": 0.13,
61
+ "learning_rate": 9.776119402985075e-05,
62
+ "loss": 0.8875,
63
  "step": 90
64
  },
65
  {
66
+ "epoch": 0.15,
67
+ "learning_rate": 9.751243781094528e-05,
68
+ "loss": 0.8306,
69
  "step": 100
70
  },
71
  {
72
+ "epoch": 0.15,
73
+ "eval_accuracy": 0.90848026868178,
74
+ "eval_loss": 0.791517972946167,
75
+ "eval_runtime": 20.4511,
76
+ "eval_samples_per_second": 58.237,
77
+ "eval_steps_per_second": 7.286,
78
  "step": 100
79
  },
80
  {
81
+ "epoch": 0.16,
82
+ "learning_rate": 9.72636815920398e-05,
83
+ "loss": 0.7402,
84
  "step": 110
85
  },
86
  {
87
+ "epoch": 0.18,
88
+ "learning_rate": 9.701492537313434e-05,
89
+ "loss": 0.7802,
90
  "step": 120
91
  },
92
  {
93
+ "epoch": 0.19,
94
+ "learning_rate": 9.676616915422886e-05,
95
+ "loss": 0.6563,
96
  "step": 130
97
  },
98
  {
99
+ "epoch": 0.21,
100
+ "learning_rate": 9.65174129353234e-05,
101
+ "loss": 0.5579,
102
  "step": 140
103
  },
104
  {
105
+ "epoch": 0.22,
106
+ "learning_rate": 9.626865671641792e-05,
107
+ "loss": 0.6564,
108
  "step": 150
109
  },
110
  {
111
+ "epoch": 0.24,
112
+ "learning_rate": 9.601990049751244e-05,
113
+ "loss": 0.5115,
114
  "step": 160
115
  },
116
  {
117
+ "epoch": 0.25,
118
+ "learning_rate": 9.577114427860697e-05,
119
+ "loss": 0.5087,
120
  "step": 170
121
  },
122
  {
123
+ "epoch": 0.27,
124
+ "learning_rate": 9.552238805970149e-05,
125
+ "loss": 0.4821,
126
  "step": 180
127
  },
128
  {
129
+ "epoch": 0.28,
130
+ "learning_rate": 9.527363184079603e-05,
131
+ "loss": 0.3972,
132
  "step": 190
133
  },
134
  {
135
+ "epoch": 0.3,
136
+ "learning_rate": 9.502487562189055e-05,
137
+ "loss": 0.3828,
138
  "step": 200
139
  },
140
  {
141
+ "epoch": 0.3,
142
+ "eval_accuracy": 0.9269521410579346,
143
+ "eval_loss": 0.42952126264572144,
144
+ "eval_runtime": 18.5768,
145
+ "eval_samples_per_second": 64.112,
146
+ "eval_steps_per_second": 8.021,
147
  "step": 200
148
  },
149
  {
150
+ "epoch": 0.31,
151
+ "learning_rate": 9.477611940298507e-05,
152
+ "loss": 0.4261,
153
  "step": 210
154
  },
155
  {
156
+ "epoch": 0.33,
157
+ "learning_rate": 9.452736318407961e-05,
158
+ "loss": 0.3445,
159
  "step": 220
160
  },
161
  {
162
+ "epoch": 0.34,
163
+ "learning_rate": 9.427860696517413e-05,
164
+ "loss": 0.3152,
165
  "step": 230
166
  },
167
  {
168
+ "epoch": 0.36,
169
+ "learning_rate": 9.402985074626867e-05,
170
+ "loss": 0.3448,
171
  "step": 240
172
  },
173
  {
174
+ "epoch": 0.37,
175
+ "learning_rate": 9.37810945273632e-05,
176
+ "loss": 0.4312,
177
  "step": 250
178
  },
179
  {
180
+ "epoch": 0.39,
181
+ "learning_rate": 9.353233830845772e-05,
182
+ "loss": 0.3472,
183
  "step": 260
184
  },
185
  {
186
+ "epoch": 0.4,
187
+ "learning_rate": 9.328358208955224e-05,
188
+ "loss": 0.4248,
189
  "step": 270
190
  },
191
  {
192
+ "epoch": 0.42,
193
+ "learning_rate": 9.303482587064676e-05,
194
+ "loss": 0.3977,
195
  "step": 280
196
  },
197
  {
198
+ "epoch": 0.43,
199
+ "learning_rate": 9.27860696517413e-05,
200
+ "loss": 0.3164,
201
  "step": 290
202
  },
203
  {
204
+ "epoch": 0.45,
205
+ "learning_rate": 9.253731343283582e-05,
206
+ "loss": 0.3623,
207
  "step": 300
208
  },
209
  {
210
+ "epoch": 0.45,
211
+ "eval_accuracy": 0.9370277078085643,
212
+ "eval_loss": 0.30219167470932007,
213
+ "eval_runtime": 23.4722,
214
+ "eval_samples_per_second": 50.741,
215
+ "eval_steps_per_second": 6.348,
216
  "step": 300
217
  },
218
  {
219
+ "epoch": 0.46,
220
+ "learning_rate": 9.228855721393035e-05,
221
+ "loss": 0.2591,
222
  "step": 310
223
  },
224
  {
225
+ "epoch": 0.48,
226
+ "learning_rate": 9.203980099502488e-05,
227
+ "loss": 0.277,
228
  "step": 320
229
  },
230
  {
231
+ "epoch": 0.49,
232
+ "learning_rate": 9.17910447761194e-05,
233
+ "loss": 0.2457,
234
  "step": 330
235
  },
236
  {
237
+ "epoch": 0.51,
238
+ "learning_rate": 9.154228855721394e-05,
239
+ "loss": 0.1788,
240
  "step": 340
241
  },
242
  {
243
+ "epoch": 0.52,
244
+ "learning_rate": 9.129353233830847e-05,
245
+ "loss": 0.2828,
246
  "step": 350
247
  },
248
  {
249
+ "epoch": 0.54,
250
+ "learning_rate": 9.104477611940299e-05,
251
+ "loss": 0.2579,
252
  "step": 360
253
  },
254
  {
255
+ "epoch": 0.55,
256
+ "learning_rate": 9.079601990049753e-05,
257
+ "loss": 0.2195,
258
  "step": 370
259
  },
260
  {
261
+ "epoch": 0.57,
262
+ "learning_rate": 9.054726368159205e-05,
263
+ "loss": 0.2773,
264
  "step": 380
265
  },
266
  {
267
+ "epoch": 0.58,
268
+ "learning_rate": 9.029850746268657e-05,
269
+ "loss": 0.188,
270
  "step": 390
271
  },
272
  {
273
+ "epoch": 0.6,
274
+ "learning_rate": 9.00497512437811e-05,
275
+ "loss": 0.2623,
276
  "step": 400
277
  },
278
  {
279
+ "epoch": 0.6,
280
+ "eval_accuracy": 0.9269521410579346,
281
+ "eval_loss": 0.2882852256298065,
282
+ "eval_runtime": 19.7432,
283
+ "eval_samples_per_second": 60.325,
284
+ "eval_steps_per_second": 7.547,
285
  "step": 400
286
  },
287
  {
288
+ "epoch": 0.61,
289
+ "learning_rate": 8.980099502487562e-05,
290
+ "loss": 0.2402,
291
  "step": 410
292
  },
293
  {
294
+ "epoch": 0.63,
295
+ "learning_rate": 8.955223880597016e-05,
296
+ "loss": 0.1562,
297
  "step": 420
298
  },
299
  {
300
+ "epoch": 0.64,
301
+ "learning_rate": 8.930348258706468e-05,
302
+ "loss": 0.3121,
303
  "step": 430
304
  },
305
  {
306
+ "epoch": 0.66,
307
+ "learning_rate": 8.905472636815922e-05,
308
+ "loss": 0.2505,
309
  "step": 440
310
  },
311
  {
312
+ "epoch": 0.67,
313
+ "learning_rate": 8.880597014925374e-05,
314
+ "loss": 0.2329,
315
  "step": 450
316
  },
317
  {
318
+ "epoch": 0.69,
319
+ "learning_rate": 8.855721393034826e-05,
320
+ "loss": 0.1912,
321
  "step": 460
322
  },
323
  {
324
+ "epoch": 0.7,
325
+ "learning_rate": 8.83084577114428e-05,
326
+ "loss": 0.2406,
327
  "step": 470
328
  },
329
  {
330
+ "epoch": 0.72,
331
+ "learning_rate": 8.805970149253732e-05,
332
+ "loss": 0.1853,
333
  "step": 480
334
  },
335
  {
336
+ "epoch": 0.73,
337
+ "learning_rate": 8.781094527363185e-05,
338
+ "loss": 0.2545,
339
  "step": 490
340
  },
341
  {
342
+ "epoch": 0.75,
343
+ "learning_rate": 8.756218905472637e-05,
344
+ "loss": 0.2251,
345
  "step": 500
346
  },
347
  {
348
+ "epoch": 0.75,
349
+ "eval_accuracy": 0.9462636439966414,
350
+ "eval_loss": 0.23029384016990662,
351
+ "eval_runtime": 20.0888,
352
+ "eval_samples_per_second": 59.287,
353
+ "eval_steps_per_second": 7.417,
354
  "step": 500
355
  },
356
  {
357
+ "epoch": 0.76,
358
+ "learning_rate": 8.731343283582089e-05,
359
+ "loss": 0.1969,
360
  "step": 510
361
  },
362
  {
363
+ "epoch": 0.78,
364
+ "learning_rate": 8.706467661691543e-05,
365
+ "loss": 0.1476,
366
  "step": 520
367
  },
368
  {
369
+ "epoch": 0.79,
370
+ "learning_rate": 8.681592039800995e-05,
371
+ "loss": 0.2066,
372
  "step": 530
373
  },
374
  {
375
+ "epoch": 0.81,
376
+ "learning_rate": 8.656716417910447e-05,
377
+ "loss": 0.3319,
378
  "step": 540
379
  },
380
  {
381
+ "epoch": 0.82,
382
+ "learning_rate": 8.631840796019901e-05,
383
+ "loss": 0.1591,
384
  "step": 550
385
  },
386
  {
387
+ "epoch": 0.84,
388
+ "learning_rate": 8.606965174129353e-05,
389
+ "loss": 0.1666,
390
  "step": 560
391
  },
392
  {
393
+ "epoch": 0.85,
394
+ "learning_rate": 8.582089552238807e-05,
395
+ "loss": 0.2891,
396
  "step": 570
397
  },
398
  {
399
+ "epoch": 0.87,
400
+ "learning_rate": 8.55721393034826e-05,
401
+ "loss": 0.178,
402
  "step": 580
403
  },
404
  {
405
+ "epoch": 0.88,
406
+ "learning_rate": 8.532338308457712e-05,
407
+ "loss": 0.1507,
408
  "step": 590
409
  },
410
  {
411
+ "epoch": 0.9,
412
+ "learning_rate": 8.507462686567164e-05,
413
+ "loss": 0.2695,
414
  "step": 600
415
  },
416
  {
417
+ "epoch": 0.9,
418
+ "eval_accuracy": 0.9378673383711167,
419
+ "eval_loss": 0.2464931756258011,
420
+ "eval_runtime": 19.6563,
421
+ "eval_samples_per_second": 60.591,
422
+ "eval_steps_per_second": 7.58,
423
  "step": 600
424
  },
425
  {
426
+ "epoch": 0.91,
427
+ "learning_rate": 8.482587064676616e-05,
428
+ "loss": 0.2417,
429
  "step": 610
430
  },
431
  {
432
+ "epoch": 0.93,
433
+ "learning_rate": 8.45771144278607e-05,
434
+ "loss": 0.2455,
435
  "step": 620
436
  },
437
  {
438
+ "epoch": 0.94,
439
+ "learning_rate": 8.432835820895522e-05,
440
+ "loss": 0.1249,
441
  "step": 630
442
  },
443
  {
444
+ "epoch": 0.96,
445
+ "learning_rate": 8.407960199004975e-05,
446
+ "loss": 0.1533,
447
  "step": 640
448
  },
449
  {
450
+ "epoch": 0.97,
451
+ "learning_rate": 8.383084577114428e-05,
452
+ "loss": 0.1838,
453
  "step": 650
454
  },
455
  {
456
+ "epoch": 0.99,
457
+ "learning_rate": 8.360696517412937e-05,
458
+ "loss": 0.1247,
459
  "step": 660
460
  },
461
  {
462
+ "epoch": 1.0,
463
+ "learning_rate": 8.335820895522388e-05,
464
+ "loss": 0.1583,
465
  "step": 670
466
  },
467
  {
468
+ "epoch": 1.01,
469
+ "learning_rate": 8.310945273631841e-05,
470
+ "loss": 0.1024,
471
  "step": 680
472
  },
473
  {
474
+ "epoch": 1.03,
475
+ "learning_rate": 8.286069651741294e-05,
476
+ "loss": 0.0599,
477
  "step": 690
478
  },
479
  {
480
+ "epoch": 1.04,
481
+ "learning_rate": 8.261194029850746e-05,
482
+ "loss": 0.1256,
483
  "step": 700
484
  },
485
  {
486
+ "epoch": 1.04,
487
+ "eval_accuracy": 0.9714525608732157,
488
+ "eval_loss": 0.12489016354084015,
489
+ "eval_runtime": 18.8609,
490
+ "eval_samples_per_second": 63.146,
491
+ "eval_steps_per_second": 7.9,
492
  "step": 700
493
  },
494
  {
495
+ "epoch": 1.06,
496
+ "learning_rate": 8.2363184079602e-05,
497
+ "loss": 0.0541,
498
  "step": 710
499
  },
500
  {
501
+ "epoch": 1.07,
502
+ "learning_rate": 8.211442786069652e-05,
503
+ "loss": 0.0742,
504
  "step": 720
505
  },
506
  {
507
+ "epoch": 1.09,
508
+ "learning_rate": 8.186567164179106e-05,
509
+ "loss": 0.0975,
510
  "step": 730
511
  },
512
  {
513
+ "epoch": 1.1,
514
+ "learning_rate": 8.161691542288558e-05,
515
+ "loss": 0.1011,
516
  "step": 740
517
  },
518
  {
519
+ "epoch": 1.12,
520
+ "learning_rate": 8.13681592039801e-05,
521
+ "loss": 0.1081,
522
  "step": 750
523
  },
524
  {
525
+ "epoch": 1.13,
526
+ "learning_rate": 8.111940298507464e-05,
527
+ "loss": 0.129,
528
  "step": 760
529
  },
530
  {
531
+ "epoch": 1.15,
532
+ "learning_rate": 8.087064676616916e-05,
533
+ "loss": 0.1595,
534
  "step": 770
535
  },
536
  {
537
+ "epoch": 1.16,
538
+ "learning_rate": 8.062189054726368e-05,
539
+ "loss": 0.163,
540
  "step": 780
541
  },
542
  {
543
+ "epoch": 1.18,
544
+ "learning_rate": 8.037313432835821e-05,
545
+ "loss": 0.1464,
546
  "step": 790
547
  },
548
  {
549
+ "epoch": 1.19,
550
+ "learning_rate": 8.012437810945273e-05,
551
+ "loss": 0.0718,
552
  "step": 800
553
  },
554
  {
555
+ "epoch": 1.19,
556
+ "eval_accuracy": 0.9622166246851386,
557
+ "eval_loss": 0.13346683979034424,
558
+ "eval_runtime": 19.3064,
559
+ "eval_samples_per_second": 61.689,
560
+ "eval_steps_per_second": 7.718,
561
  "step": 800
562
  },
563
  {
564
+ "epoch": 1.21,
565
+ "learning_rate": 7.987562189054727e-05,
566
+ "loss": 0.0517,
567
  "step": 810
568
  },
569
  {
570
+ "epoch": 1.22,
571
+ "learning_rate": 7.962686567164179e-05,
572
+ "loss": 0.1143,
573
  "step": 820
574
  },
575
  {
576
+ "epoch": 1.24,
577
+ "learning_rate": 7.937810945273633e-05,
578
+ "loss": 0.062,
579
  "step": 830
580
  },
581
  {
582
+ "epoch": 1.25,
583
+ "learning_rate": 7.912935323383085e-05,
584
+ "loss": 0.2013,
585
  "step": 840
586
  },
587
  {
588
+ "epoch": 1.27,
589
+ "learning_rate": 7.888059701492537e-05,
590
+ "loss": 0.0995,
591
  "step": 850
592
  },
593
  {
594
+ "epoch": 1.28,
595
+ "learning_rate": 7.863184079601991e-05,
596
+ "loss": 0.1009,
597
  "step": 860
598
  },
599
  {
600
+ "epoch": 1.3,
601
+ "learning_rate": 7.838308457711443e-05,
602
+ "loss": 0.1292,
603
  "step": 870
604
  },
605
  {
606
+ "epoch": 1.31,
607
+ "learning_rate": 7.813432835820896e-05,
608
+ "loss": 0.0848,
609
  "step": 880
610
  },
611
  {
612
+ "epoch": 1.33,
613
+ "learning_rate": 7.788557213930348e-05,
614
+ "loss": 0.0323,
615
  "step": 890
616
  },
617
  {
618
+ "epoch": 1.34,
619
+ "learning_rate": 7.7636815920398e-05,
620
+ "loss": 0.0436,
621
  "step": 900
622
  },
623
  {
624
+ "epoch": 1.34,
625
+ "eval_accuracy": 0.979009235936188,
626
+ "eval_loss": 0.09892778843641281,
627
+ "eval_runtime": 20.325,
628
+ "eval_samples_per_second": 58.598,
629
+ "eval_steps_per_second": 7.331,
630
  "step": 900
631
  },
632
  {
633
+ "epoch": 1.36,
634
+ "learning_rate": 7.738805970149254e-05,
635
+ "loss": 0.0345,
636
  "step": 910
637
  },
638
  {
639
+ "epoch": 1.37,
640
+ "learning_rate": 7.713930348258706e-05,
641
+ "loss": 0.0505,
642
  "step": 920
643
  },
644
  {
645
+ "epoch": 1.39,
646
+ "learning_rate": 7.68905472636816e-05,
647
+ "loss": 0.0464,
648
  "step": 930
649
  },
650
  {
651
+ "epoch": 1.4,
652
+ "learning_rate": 7.664179104477612e-05,
653
+ "loss": 0.0654,
654
  "step": 940
655
  },
656
  {
657
+ "epoch": 1.42,
658
+ "learning_rate": 7.639303482587065e-05,
659
+ "loss": 0.0776,
660
  "step": 950
661
  },
662
  {
663
+ "epoch": 1.43,
664
+ "learning_rate": 7.614427860696518e-05,
665
+ "loss": 0.0396,
666
  "step": 960
667
  },
668
  {
669
+ "epoch": 1.45,
670
+ "learning_rate": 7.589552238805971e-05,
671
+ "loss": 0.0373,
672
  "step": 970
673
  },
674
  {
675
+ "epoch": 1.46,
676
+ "learning_rate": 7.564676616915424e-05,
677
+ "loss": 0.083,
678
  "step": 980
679
  },
680
  {
681
+ "epoch": 1.48,
682
+ "learning_rate": 7.539800995024875e-05,
683
+ "loss": 0.1018,
684
  "step": 990
685
  },
686
  {
687
+ "epoch": 1.49,
688
+ "learning_rate": 7.514925373134328e-05,
689
+ "loss": 0.1681,
690
  "step": 1000
691
  },
692
  {
693
+ "epoch": 1.49,
694
+ "eval_accuracy": 0.9496221662468514,
695
+ "eval_loss": 0.18528999388217926,
696
+ "eval_runtime": 19.3383,
697
+ "eval_samples_per_second": 61.588,
698
+ "eval_steps_per_second": 7.705,
699
  "step": 1000
700
  },
701
  {
702
+ "epoch": 1.51,
703
+ "learning_rate": 7.490049751243781e-05,
704
+ "loss": 0.0607,
705
  "step": 1010
706
  },
707
  {
708
+ "epoch": 1.52,
709
+ "learning_rate": 7.465174129353234e-05,
710
+ "loss": 0.1343,
711
  "step": 1020
712
  },
713
  {
714
+ "epoch": 1.54,
715
+ "learning_rate": 7.440298507462687e-05,
716
+ "loss": 0.1163,
717
  "step": 1030
718
  },
719
  {
720
+ "epoch": 1.55,
721
+ "learning_rate": 7.41542288557214e-05,
722
+ "loss": 0.0628,
723
  "step": 1040
724
  },
725
  {
726
+ "epoch": 1.57,
727
+ "learning_rate": 7.390547263681592e-05,
728
+ "loss": 0.1001,
729
  "step": 1050
730
  },
731
  {
732
+ "epoch": 1.58,
733
+ "learning_rate": 7.365671641791046e-05,
734
+ "loss": 0.0764,
735
  "step": 1060
736
  },
737
  {
738
+ "epoch": 1.6,
739
+ "learning_rate": 7.340796019900498e-05,
740
+ "loss": 0.0281,
741
  "step": 1070
742
  },
743
  {
744
+ "epoch": 1.61,
745
+ "learning_rate": 7.315920398009952e-05,
746
+ "loss": 0.0861,
747
  "step": 1080
748
  },
749
  {
750
+ "epoch": 1.63,
751
+ "learning_rate": 7.291044776119404e-05,
752
+ "loss": 0.0305,
753
  "step": 1090
754
  },
755
  {
756
+ "epoch": 1.64,
757
+ "learning_rate": 7.266169154228856e-05,
758
+ "loss": 0.0771,
759
  "step": 1100
760
  },
761
  {
762
+ "epoch": 1.64,
763
+ "eval_accuracy": 0.964735516372796,
764
+ "eval_loss": 0.14002813398838043,
765
+ "eval_runtime": 19.5,
766
+ "eval_samples_per_second": 61.077,
767
+ "eval_steps_per_second": 7.641,
768
  "step": 1100
769
  },
770
  {
771
+ "epoch": 1.66,
772
+ "learning_rate": 7.241293532338309e-05,
773
+ "loss": 0.1127,
774
  "step": 1110
775
  },
776
  {
777
+ "epoch": 1.67,
778
+ "learning_rate": 7.216417910447761e-05,
779
+ "loss": 0.0412,
780
  "step": 1120
781
  },
782
  {
783
+ "epoch": 1.69,
784
+ "learning_rate": 7.191542288557215e-05,
785
+ "loss": 0.1501,
786
  "step": 1130
787
  },
788
  {
789
+ "epoch": 1.7,
790
+ "learning_rate": 7.166666666666667e-05,
791
+ "loss": 0.1287,
792
  "step": 1140
793
  },
794
  {
795
+ "epoch": 1.72,
796
+ "learning_rate": 7.141791044776119e-05,
797
+ "loss": 0.0528,
798
  "step": 1150
799
  },
800
  {
801
+ "epoch": 1.73,
802
+ "learning_rate": 7.116915422885573e-05,
803
+ "loss": 0.0139,
804
  "step": 1160
805
  },
806
  {
807
+ "epoch": 1.75,
808
+ "learning_rate": 7.092039800995025e-05,
809
+ "loss": 0.0465,
810
  "step": 1170
811
  },
812
  {
813
+ "epoch": 1.76,
814
+ "learning_rate": 7.067164179104479e-05,
815
+ "loss": 0.0412,
816
  "step": 1180
817
  },
818
  {
819
+ "epoch": 1.78,
820
+ "learning_rate": 7.042288557213931e-05,
821
+ "loss": 0.0812,
822
  "step": 1190
823
  },
824
  {
825
+ "epoch": 1.79,
826
+ "learning_rate": 7.017412935323384e-05,
827
+ "loss": 0.0815,
828
  "step": 1200
829
  },
830
  {
831
+ "epoch": 1.79,
832
+ "eval_accuracy": 0.9764903442485307,
833
+ "eval_loss": 0.0969606265425682,
834
+ "eval_runtime": 18.9862,
835
+ "eval_samples_per_second": 62.73,
836
+ "eval_steps_per_second": 7.848,
837
  "step": 1200
838
  },
839
  {
840
+ "epoch": 1.81,
841
+ "learning_rate": 6.992537313432836e-05,
842
+ "loss": 0.0212,
843
  "step": 1210
844
  },
845
  {
846
+ "epoch": 1.82,
847
+ "learning_rate": 6.967661691542288e-05,
848
+ "loss": 0.0576,
849
  "step": 1220
850
  },
851
  {
852
+ "epoch": 1.84,
853
+ "learning_rate": 6.942786069651742e-05,
854
+ "loss": 0.0269,
855
  "step": 1230
856
  },
857
  {
858
+ "epoch": 1.85,
859
+ "learning_rate": 6.917910447761194e-05,
860
+ "loss": 0.0557,
861
  "step": 1240
862
  },
863
  {
864
+ "epoch": 1.87,
865
+ "learning_rate": 6.893034825870647e-05,
866
+ "loss": 0.1284,
867
  "step": 1250
868
  },
869
  {
870
+ "epoch": 1.88,
871
+ "learning_rate": 6.8681592039801e-05,
872
+ "loss": 0.0499,
873
  "step": 1260
874
  },
875
  {
876
+ "epoch": 1.9,
877
+ "learning_rate": 6.843283582089553e-05,
878
+ "loss": 0.114,
879
  "step": 1270
880
  },
881
  {
882
+ "epoch": 1.91,
883
+ "learning_rate": 6.818407960199006e-05,
884
+ "loss": 0.0338,
885
  "step": 1280
886
  },
887
  {
888
+ "epoch": 1.93,
889
+ "learning_rate": 6.793532338308459e-05,
890
+ "loss": 0.1359,
891
  "step": 1290
892
  },
893
  {
894
+ "epoch": 1.94,
895
+ "learning_rate": 6.768656716417911e-05,
896
+ "loss": 0.0203,
897
  "step": 1300
898
  },
899
  {
900
+ "epoch": 1.94,
901
+ "eval_accuracy": 0.964735516372796,
902
+ "eval_loss": 0.14076600968837738,
903
+ "eval_runtime": 19.9477,
904
+ "eval_samples_per_second": 59.706,
905
+ "eval_steps_per_second": 7.47,
906
  "step": 1300
907
  },
908
  {
909
+ "epoch": 1.96,
910
+ "learning_rate": 6.743781094527363e-05,
911
+ "loss": 0.0373,
912
  "step": 1310
913
  },
914
  {
915
+ "epoch": 1.97,
916
+ "learning_rate": 6.718905472636815e-05,
917
+ "loss": 0.0751,
918
  "step": 1320
919
  },
920
  {
921
+ "epoch": 1.99,
922
+ "learning_rate": 6.694029850746268e-05,
923
+ "loss": 0.0923,
924
  "step": 1330
925
  },
926
  {
927
+ "epoch": 2.0,
928
+ "learning_rate": 6.669154228855721e-05,
929
+ "loss": 0.0406,
930
  "step": 1340
931
  },
932
  {
933
+ "epoch": 2.01,
934
+ "learning_rate": 6.644278606965174e-05,
935
+ "loss": 0.0333,
936
  "step": 1350
937
  },
938
  {
939
+ "epoch": 2.03,
940
+ "learning_rate": 6.619402985074627e-05,
941
+ "loss": 0.0674,
942
  "step": 1360
943
  },
944
  {
945
+ "epoch": 2.04,
946
+ "learning_rate": 6.59452736318408e-05,
947
+ "loss": 0.0971,
948
  "step": 1370
949
  },
950
  {
951
+ "epoch": 2.06,
952
+ "learning_rate": 6.569651741293532e-05,
953
+ "loss": 0.1701,
954
  "step": 1380
955
  },
956
  {
957
+ "epoch": 2.07,
958
+ "learning_rate": 6.544776119402986e-05,
959
+ "loss": 0.1362,
960
  "step": 1390
961
  },
962
  {
963
+ "epoch": 2.09,
964
+ "learning_rate": 6.519900497512438e-05,
965
+ "loss": 0.0774,
966
  "step": 1400
967
  },
968
  {
969
+ "epoch": 2.09,
970
+ "eval_accuracy": 0.966414777497901,
971
+ "eval_loss": 0.12388637661933899,
972
+ "eval_runtime": 19.0185,
973
+ "eval_samples_per_second": 62.623,
974
+ "eval_steps_per_second": 7.834,
975
  "step": 1400
976
  },
977
  {
978
+ "epoch": 2.1,
979
+ "learning_rate": 6.495024875621892e-05,
980
+ "loss": 0.0306,
981
  "step": 1410
982
  },
983
  {
984
+ "epoch": 2.12,
985
+ "learning_rate": 6.470149253731344e-05,
986
+ "loss": 0.0421,
987
  "step": 1420
988
  },
989
  {
990
+ "epoch": 2.13,
991
+ "learning_rate": 6.445273631840796e-05,
992
+ "loss": 0.0408,
993
  "step": 1430
994
  },
995
  {
996
+ "epoch": 2.15,
997
+ "learning_rate": 6.420398009950249e-05,
998
+ "loss": 0.0333,
999
  "step": 1440
1000
  },
1001
  {
1002
+ "epoch": 2.16,
1003
+ "learning_rate": 6.395522388059701e-05,
1004
+ "loss": 0.0124,
1005
  "step": 1450
1006
  },
1007
  {
1008
+ "epoch": 2.18,
1009
+ "learning_rate": 6.370646766169155e-05,
1010
+ "loss": 0.0546,
1011
  "step": 1460
1012
  },
1013
  {
1014
+ "epoch": 2.19,
1015
+ "learning_rate": 6.345771144278607e-05,
1016
+ "loss": 0.0088,
1017
  "step": 1470
1018
  },
1019
  {
1020
+ "epoch": 2.21,
1021
+ "learning_rate": 6.32089552238806e-05,
1022
+ "loss": 0.0525,
1023
  "step": 1480
1024
  },
1025
  {
1026
+ "epoch": 2.22,
1027
+ "learning_rate": 6.296019900497513e-05,
1028
+ "loss": 0.0336,
1029
  "step": 1490
1030
  },
1031
  {
1032
+ "epoch": 2.24,
1033
+ "learning_rate": 6.271144278606965e-05,
1034
+ "loss": 0.0723,
1035
  "step": 1500
1036
  },
1037
  {
1038
+ "epoch": 2.24,
1039
+ "eval_accuracy": 0.982367758186398,
1040
+ "eval_loss": 0.06230420619249344,
1041
+ "eval_runtime": 20.5805,
1042
+ "eval_samples_per_second": 57.87,
1043
+ "eval_steps_per_second": 7.24,
1044
  "step": 1500
1045
  },
1046
  {
1047
+ "epoch": 2.25,
1048
+ "learning_rate": 6.246268656716419e-05,
1049
+ "loss": 0.04,
1050
  "step": 1510
1051
  },
1052
  {
1053
+ "epoch": 2.27,
1054
+ "learning_rate": 6.221393034825871e-05,
1055
+ "loss": 0.029,
1056
  "step": 1520
1057
  },
1058
  {
1059
+ "epoch": 2.28,
1060
+ "learning_rate": 6.196517412935324e-05,
1061
+ "loss": 0.0161,
1062
  "step": 1530
1063
  },
1064
  {
1065
+ "epoch": 2.3,
1066
+ "learning_rate": 6.171641791044776e-05,
1067
+ "loss": 0.0307,
1068
  "step": 1540
1069
  },
1070
  {
1071
+ "epoch": 2.31,
1072
+ "learning_rate": 6.146766169154228e-05,
1073
+ "loss": 0.0099,
1074
  "step": 1550
1075
  },
1076
  {
1077
+ "epoch": 2.33,
1078
+ "learning_rate": 6.121890547263682e-05,
1079
+ "loss": 0.0814,
1080
  "step": 1560
1081
  },
1082
  {
1083
+ "epoch": 2.34,
1084
+ "learning_rate": 6.0970149253731343e-05,
1085
+ "loss": 0.052,
1086
  "step": 1570
1087
  },
1088
  {
1089
+ "epoch": 2.36,
1090
+ "learning_rate": 6.0721393034825867e-05,
1091
+ "loss": 0.0437,
1092
  "step": 1580
1093
  },
1094
  {
1095
+ "epoch": 2.37,
1096
+ "learning_rate": 6.04726368159204e-05,
1097
+ "loss": 0.0411,
1098
  "step": 1590
1099
  },
1100
  {
1101
+ "epoch": 2.39,
1102
+ "learning_rate": 6.0223880597014927e-05,
1103
+ "loss": 0.0187,
1104
  "step": 1600
1105
  },
1106
  {
1107
+ "epoch": 2.39,
1108
+ "eval_accuracy": 0.9731318219983207,
1109
+ "eval_loss": 0.0926276370882988,
1110
+ "eval_runtime": 20.5962,
1111
+ "eval_samples_per_second": 57.826,
1112
+ "eval_steps_per_second": 7.234,
1113
  "step": 1600
1114
  },
1115
  {
1116
+ "epoch": 2.4,
1117
+ "learning_rate": 5.9975124378109457e-05,
1118
+ "loss": 0.0374,
1119
  "step": 1610
1120
  },
1121
  {
1122
+ "epoch": 2.42,
1123
+ "learning_rate": 5.972636815920398e-05,
1124
+ "loss": 0.0216,
1125
  "step": 1620
1126
  },
1127
  {
1128
+ "epoch": 2.43,
1129
+ "learning_rate": 5.94776119402985e-05,
1130
+ "loss": 0.0667,
1131
  "step": 1630
1132
  },
1133
  {
1134
+ "epoch": 2.45,
1135
+ "learning_rate": 5.922885572139304e-05,
1136
+ "loss": 0.0691,
1137
  "step": 1640
1138
  },
1139
  {
1140
+ "epoch": 2.46,
1141
+ "learning_rate": 5.898009950248756e-05,
1142
+ "loss": 0.0364,
1143
  "step": 1650
1144
  },
1145
  {
1146
+ "epoch": 2.48,
1147
+ "learning_rate": 5.87313432835821e-05,
1148
+ "loss": 0.0212,
1149
  "step": 1660
1150
  },
1151
  {
1152
+ "epoch": 2.49,
1153
+ "learning_rate": 5.8482587064676616e-05,
1154
+ "loss": 0.0209,
1155
  "step": 1670
1156
  },
1157
  {
1158
+ "epoch": 2.51,
1159
+ "learning_rate": 5.823383084577114e-05,
1160
+ "loss": 0.012,
1161
  "step": 1680
1162
  },
1163
  {
1164
+ "epoch": 2.52,
1165
+ "learning_rate": 5.7985074626865676e-05,
1166
+ "loss": 0.0118,
1167
  "step": 1690
1168
  },
1169
  {
1170
+ "epoch": 2.54,
1171
+ "learning_rate": 5.77363184079602e-05,
1172
+ "loss": 0.055,
1173
  "step": 1700
1174
  },
1175
  {
1176
+ "epoch": 2.54,
1177
+ "eval_accuracy": 0.9756507136859782,
1178
+ "eval_loss": 0.09100380539894104,
1179
+ "eval_runtime": 19.4994,
1180
+ "eval_samples_per_second": 61.079,
1181
+ "eval_steps_per_second": 7.641,
1182
  "step": 1700
1183
  },
1184
  {
1185
+ "epoch": 2.55,
1186
+ "learning_rate": 5.7487562189054736e-05,
1187
+ "loss": 0.0161,
1188
  "step": 1710
1189
  },
1190
  {
1191
+ "epoch": 2.57,
1192
+ "learning_rate": 5.723880597014926e-05,
1193
+ "loss": 0.0071,
1194
  "step": 1720
1195
  },
1196
  {
1197
+ "epoch": 2.58,
1198
+ "learning_rate": 5.699004975124378e-05,
1199
+ "loss": 0.0103,
1200
  "step": 1730
1201
  },
1202
  {
1203
+ "epoch": 2.6,
1204
+ "learning_rate": 5.674129353233831e-05,
1205
+ "loss": 0.0706,
1206
  "step": 1740
1207
  },
1208
  {
1209
+ "epoch": 2.61,
1210
+ "learning_rate": 5.6492537313432836e-05,
1211
+ "loss": 0.0523,
1212
  "step": 1750
1213
  },
1214
  {
1215
+ "epoch": 2.63,
1216
+ "learning_rate": 5.624378109452737e-05,
1217
+ "loss": 0.0853,
1218
  "step": 1760
1219
  },
1220
  {
1221
+ "epoch": 2.64,
1222
+ "learning_rate": 5.5995024875621896e-05,
1223
+ "loss": 0.027,
1224
  "step": 1770
1225
  },
1226
  {
1227
+ "epoch": 2.66,
1228
+ "learning_rate": 5.574626865671642e-05,
1229
+ "loss": 0.0283,
1230
  "step": 1780
1231
  },
1232
  {
1233
+ "epoch": 2.67,
1234
+ "learning_rate": 5.549751243781095e-05,
1235
+ "loss": 0.0135,
1236
  "step": 1790
1237
  },
1238
  {
1239
+ "epoch": 2.69,
1240
+ "learning_rate": 5.524875621890547e-05,
1241
+ "loss": 0.0304,
1242
  "step": 1800
1243
  },
1244
  {
1245
+ "epoch": 2.69,
1246
+ "eval_accuracy": 0.9722921914357683,
1247
+ "eval_loss": 0.12775705754756927,
1248
+ "eval_runtime": 18.9368,
1249
+ "eval_samples_per_second": 62.893,
1250
+ "eval_steps_per_second": 7.868,
1251
  "step": 1800
1252
  },
1253
  {
1254
+ "epoch": 2.7,
1255
+ "learning_rate": 5.500000000000001e-05,
1256
+ "loss": 0.0089,
1257
  "step": 1810
1258
  },
1259
  {
1260
+ "epoch": 2.72,
1261
+ "learning_rate": 5.475124378109453e-05,
1262
+ "loss": 0.0697,
1263
  "step": 1820
1264
  },
1265
  {
1266
+ "epoch": 2.73,
1267
+ "learning_rate": 5.4502487562189055e-05,
1268
+ "loss": 0.0094,
1269
  "step": 1830
1270
  },
1271
  {
1272
+ "epoch": 2.75,
1273
+ "learning_rate": 5.4253731343283585e-05,
1274
+ "loss": 0.0103,
1275
  "step": 1840
1276
  },
1277
  {
1278
+ "epoch": 2.76,
1279
+ "learning_rate": 5.400497512437811e-05,
1280
+ "loss": 0.0399,
1281
  "step": 1850
1282
  },
1283
  {
1284
+ "epoch": 2.78,
1285
+ "learning_rate": 5.3756218905472645e-05,
1286
+ "loss": 0.0181,
1287
  "step": 1860
1288
  },
1289
  {
1290
+ "epoch": 2.79,
1291
+ "learning_rate": 5.350746268656717e-05,
1292
+ "loss": 0.0129,
1293
  "step": 1870
1294
  },
1295
  {
1296
+ "epoch": 2.81,
1297
+ "learning_rate": 5.325870646766169e-05,
1298
+ "loss": 0.0096,
1299
  "step": 1880
1300
  },
1301
  {
1302
+ "epoch": 2.82,
1303
+ "learning_rate": 5.300995024875622e-05,
1304
+ "loss": 0.0215,
1305
  "step": 1890
1306
  },
1307
  {
1308
+ "epoch": 2.84,
1309
+ "learning_rate": 5.2761194029850745e-05,
1310
+ "loss": 0.0068,
1311
  "step": 1900
1312
  },
1313
  {
1314
+ "epoch": 2.84,
1315
+ "eval_accuracy": 0.9680940386230059,
1316
+ "eval_loss": 0.12825354933738708,
1317
+ "eval_runtime": 20.818,
1318
+ "eval_samples_per_second": 57.21,
1319
+ "eval_steps_per_second": 7.157,
1320
  "step": 1900
1321
  },
1322
  {
1323
+ "epoch": 2.85,
1324
+ "learning_rate": 5.251243781094528e-05,
1325
+ "loss": 0.0513,
1326
  "step": 1910
1327
  },
1328
  {
1329
+ "epoch": 2.87,
1330
+ "learning_rate": 5.2263681592039805e-05,
1331
+ "loss": 0.0949,
1332
  "step": 1920
1333
  },
1334
  {
1335
+ "epoch": 2.88,
1336
+ "learning_rate": 5.201492537313433e-05,
1337
+ "loss": 0.0435,
1338
  "step": 1930
1339
  },
1340
  {
1341
+ "epoch": 2.9,
1342
+ "learning_rate": 5.176616915422886e-05,
1343
+ "loss": 0.0382,
1344
  "step": 1940
1345
  },
1346
  {
1347
+ "epoch": 2.91,
1348
+ "learning_rate": 5.151741293532338e-05,
1349
+ "loss": 0.0579,
1350
  "step": 1950
1351
  },
1352
  {
1353
+ "epoch": 2.93,
1354
+ "learning_rate": 5.126865671641792e-05,
1355
+ "loss": 0.0151,
1356
  "step": 1960
1357
  },
1358
  {
1359
+ "epoch": 2.94,
1360
+ "learning_rate": 5.101990049751244e-05,
1361
+ "loss": 0.051,
1362
  "step": 1970
1363
  },
1364
  {
1365
+ "epoch": 2.96,
1366
+ "learning_rate": 5.0771144278606964e-05,
1367
+ "loss": 0.0474,
1368
  "step": 1980
1369
  },
1370
  {
1371
+ "epoch": 2.97,
1372
+ "learning_rate": 5.05223880597015e-05,
1373
+ "loss": 0.0227,
1374
  "step": 1990
1375
  },
1376
  {
1377
+ "epoch": 2.99,
1378
+ "learning_rate": 5.027363184079602e-05,
1379
+ "loss": 0.0319,
1380
  "step": 2000
1381
  },
1382
  {
1383
+ "epoch": 2.99,
1384
+ "eval_accuracy": 0.9865659109991604,
1385
+ "eval_loss": 0.0589967742562294,
1386
+ "eval_runtime": 19.1134,
1387
+ "eval_samples_per_second": 62.312,
1388
+ "eval_steps_per_second": 7.796,
1389
  "step": 2000
1390
  },
1391
  {
1392
+ "epoch": 3.0,
1393
+ "learning_rate": 5.0024875621890554e-05,
1394
+ "loss": 0.013,
1395
  "step": 2010
1396
  },
1397
  {
1398
+ "epoch": 3.01,
1399
+ "learning_rate": 4.977611940298508e-05,
1400
+ "loss": 0.0053,
1401
  "step": 2020
1402
  },
1403
  {
1404
+ "epoch": 3.03,
1405
+ "learning_rate": 4.952736318407961e-05,
1406
+ "loss": 0.036,
1407
  "step": 2030
1408
  },
1409
  {
1410
+ "epoch": 3.04,
1411
+ "learning_rate": 4.927860696517414e-05,
1412
+ "loss": 0.005,
1413
  "step": 2040
1414
  },
1415
  {
1416
+ "epoch": 3.06,
1417
+ "learning_rate": 4.902985074626866e-05,
1418
+ "loss": 0.0061,
1419
  "step": 2050
1420
  },
1421
  {
1422
+ "epoch": 3.07,
1423
+ "learning_rate": 4.8781094527363184e-05,
1424
+ "loss": 0.052,
1425
  "step": 2060
1426
  },
1427
  {
1428
+ "epoch": 3.09,
1429
+ "learning_rate": 4.8532338308457714e-05,
1430
+ "loss": 0.0093,
1431
  "step": 2070
1432
  },
1433
  {
1434
+ "epoch": 3.1,
1435
+ "learning_rate": 4.8283582089552244e-05,
1436
+ "loss": 0.0055,
1437
  "step": 2080
1438
  },
1439
  {
1440
+ "epoch": 3.12,
1441
+ "learning_rate": 4.803482587064677e-05,
1442
+ "loss": 0.0194,
1443
  "step": 2090
1444
  },
1445
  {
1446
+ "epoch": 3.13,
1447
+ "learning_rate": 4.77860696517413e-05,
1448
+ "loss": 0.0054,
1449
  "step": 2100
1450
  },
1451
  {
1452
+ "epoch": 3.13,
1453
+ "eval_accuracy": 0.982367758186398,
1454
+ "eval_loss": 0.07611096650362015,
1455
+ "eval_runtime": 19.2367,
1456
+ "eval_samples_per_second": 61.913,
1457
+ "eval_steps_per_second": 7.746,
1458
  "step": 2100
1459
  },
1460
  {
1461
+ "epoch": 3.15,
1462
+ "learning_rate": 4.753731343283582e-05,
1463
+ "loss": 0.0433,
1464
  "step": 2110
1465
  },
1466
  {
1467
+ "epoch": 3.16,
1468
+ "learning_rate": 4.728855721393035e-05,
1469
+ "loss": 0.0047,
1470
  "step": 2120
1471
  },
1472
  {
1473
+ "epoch": 3.18,
1474
+ "learning_rate": 4.703980099502488e-05,
1475
+ "loss": 0.005,
1476
  "step": 2130
1477
  },
1478
  {
1479
+ "epoch": 3.19,
1480
+ "learning_rate": 4.67910447761194e-05,
1481
+ "loss": 0.0087,
1482
  "step": 2140
1483
  },
1484
  {
1485
+ "epoch": 3.21,
1486
+ "learning_rate": 4.654228855721393e-05,
1487
+ "loss": 0.0045,
1488
  "step": 2150
1489
  },
1490
  {
1491
+ "epoch": 3.22,
1492
+ "learning_rate": 4.6293532338308456e-05,
1493
+ "loss": 0.0229,
1494
  "step": 2160
1495
  },
1496
  {
1497
+ "epoch": 3.24,
1498
+ "learning_rate": 4.6044776119402986e-05,
1499
+ "loss": 0.0043,
1500
  "step": 2170
1501
  },
1502
  {
1503
+ "epoch": 3.25,
1504
+ "learning_rate": 4.5796019900497516e-05,
1505
+ "loss": 0.006,
1506
  "step": 2180
1507
  },
1508
  {
1509
+ "epoch": 3.27,
1510
+ "learning_rate": 4.554726368159204e-05,
1511
+ "loss": 0.0361,
1512
  "step": 2190
1513
  },
1514
  {
1515
+ "epoch": 3.28,
1516
+ "learning_rate": 4.529850746268657e-05,
1517
+ "loss": 0.0136,
1518
  "step": 2200
1519
  },
1520
  {
1521
+ "epoch": 3.28,
1522
+ "eval_accuracy": 0.982367758186398,
1523
+ "eval_loss": 0.07589124888181686,
1524
+ "eval_runtime": 18.4299,
1525
+ "eval_samples_per_second": 64.623,
1526
+ "eval_steps_per_second": 8.085,
1527
  "step": 2200
1528
  },
1529
  {
1530
+ "epoch": 3.3,
1531
+ "learning_rate": 4.50497512437811e-05,
1532
+ "loss": 0.0046,
1533
  "step": 2210
1534
  },
1535
  {
1536
+ "epoch": 3.31,
1537
+ "learning_rate": 4.480099502487562e-05,
1538
+ "loss": 0.0136,
1539
  "step": 2220
1540
  },
1541
  {
1542
+ "epoch": 3.33,
1543
+ "learning_rate": 4.455223880597015e-05,
1544
+ "loss": 0.0042,
1545
  "step": 2230
1546
  },
1547
  {
1548
+ "epoch": 3.34,
1549
+ "learning_rate": 4.4303482587064676e-05,
1550
+ "loss": 0.0344,
1551
  "step": 2240
1552
  },
1553
  {
1554
+ "epoch": 3.36,
1555
+ "learning_rate": 4.4054726368159206e-05,
1556
+ "loss": 0.0112,
1557
  "step": 2250
1558
  },
1559
  {
1560
+ "epoch": 3.37,
1561
+ "learning_rate": 4.3805970149253736e-05,
1562
+ "loss": 0.0401,
1563
  "step": 2260
1564
  },
1565
  {
1566
+ "epoch": 3.39,
1567
+ "learning_rate": 4.355721393034826e-05,
1568
+ "loss": 0.0255,
1569
  "step": 2270
1570
  },
1571
  {
1572
+ "epoch": 3.4,
1573
+ "learning_rate": 4.330845771144279e-05,
1574
+ "loss": 0.0053,
1575
  "step": 2280
1576
  },
1577
  {
1578
+ "epoch": 3.42,
1579
+ "learning_rate": 4.305970149253731e-05,
1580
+ "loss": 0.0223,
1581
  "step": 2290
1582
  },
1583
  {
1584
+ "epoch": 3.43,
1585
+ "learning_rate": 4.281094527363184e-05,
1586
+ "loss": 0.0136,
1587
  "step": 2300
1588
  },
1589
  {
1590
+ "epoch": 3.43,
1591
+ "eval_accuracy": 0.9865659109991604,
1592
+ "eval_loss": 0.04480349272489548,
1593
+ "eval_runtime": 19.7958,
1594
+ "eval_samples_per_second": 60.164,
1595
+ "eval_steps_per_second": 7.527,
1596
  "step": 2300
1597
  },
1598
  {
1599
+ "epoch": 3.45,
1600
+ "learning_rate": 4.256218905472637e-05,
1601
+ "loss": 0.0552,
1602
  "step": 2310
1603
  },
1604
  {
1605
+ "epoch": 3.46,
1606
+ "learning_rate": 4.2313432835820895e-05,
1607
+ "loss": 0.0041,
1608
  "step": 2320
1609
  },
1610
  {
1611
+ "epoch": 3.48,
1612
+ "learning_rate": 4.2064676616915425e-05,
1613
+ "loss": 0.004,
1614
  "step": 2330
1615
  },
1616
  {
1617
+ "epoch": 3.49,
1618
+ "learning_rate": 4.181592039800995e-05,
1619
+ "loss": 0.0041,
1620
  "step": 2340
1621
  },
1622
  {
1623
+ "epoch": 3.51,
1624
+ "learning_rate": 4.156716417910448e-05,
1625
+ "loss": 0.012,
1626
  "step": 2350
1627
  },
1628
  {
1629
+ "epoch": 3.52,
1630
+ "learning_rate": 4.131840796019901e-05,
1631
+ "loss": 0.0039,
1632
  "step": 2360
1633
  },
1634
  {
1635
+ "epoch": 3.54,
1636
+ "learning_rate": 4.106965174129354e-05,
1637
+ "loss": 0.0052,
1638
  "step": 2370
1639
  },
1640
  {
1641
+ "epoch": 3.55,
1642
+ "learning_rate": 4.082089552238806e-05,
1643
+ "loss": 0.0049,
1644
  "step": 2380
1645
  },
1646
  {
1647
+ "epoch": 3.57,
1648
+ "learning_rate": 4.0572139303482585e-05,
1649
+ "loss": 0.0297,
1650
+ "step": 2390
1651
+ },
1652
+ {
1653
+ "epoch": 3.58,
1654
+ "learning_rate": 4.0323383084577115e-05,
1655
+ "loss": 0.0086,
1656
+ "step": 2400
1657
+ },
1658
+ {
1659
+ "epoch": 3.58,
1660
+ "eval_accuracy": 0.9848866498740554,
1661
+ "eval_loss": 0.08157633990049362,
1662
+ "eval_runtime": 19.2031,
1663
+ "eval_samples_per_second": 62.021,
1664
+ "eval_steps_per_second": 7.759,
1665
+ "step": 2400
1666
+ },
1667
+ {
1668
+ "epoch": 3.6,
1669
+ "learning_rate": 4.0074626865671645e-05,
1670
+ "loss": 0.0201,
1671
+ "step": 2410
1672
+ },
1673
+ {
1674
+ "epoch": 3.61,
1675
+ "learning_rate": 3.9825870646766175e-05,
1676
+ "loss": 0.0043,
1677
+ "step": 2420
1678
+ },
1679
+ {
1680
+ "epoch": 3.63,
1681
+ "learning_rate": 3.95771144278607e-05,
1682
+ "loss": 0.0037,
1683
+ "step": 2430
1684
+ },
1685
+ {
1686
+ "epoch": 3.64,
1687
+ "learning_rate": 3.932835820895522e-05,
1688
+ "loss": 0.0038,
1689
+ "step": 2440
1690
+ },
1691
+ {
1692
+ "epoch": 3.66,
1693
+ "learning_rate": 3.907960199004975e-05,
1694
+ "loss": 0.0698,
1695
+ "step": 2450
1696
+ },
1697
+ {
1698
+ "epoch": 3.67,
1699
+ "learning_rate": 3.883084577114428e-05,
1700
+ "loss": 0.0037,
1701
+ "step": 2460
1702
+ },
1703
+ {
1704
+ "epoch": 3.69,
1705
+ "learning_rate": 3.858208955223881e-05,
1706
+ "loss": 0.0039,
1707
+ "step": 2470
1708
+ },
1709
+ {
1710
+ "epoch": 3.7,
1711
+ "learning_rate": 3.8333333333333334e-05,
1712
+ "loss": 0.0267,
1713
+ "step": 2480
1714
+ },
1715
+ {
1716
+ "epoch": 3.72,
1717
+ "learning_rate": 3.808457711442786e-05,
1718
+ "loss": 0.0035,
1719
+ "step": 2490
1720
+ },
1721
+ {
1722
+ "epoch": 3.73,
1723
+ "learning_rate": 3.783582089552239e-05,
1724
+ "loss": 0.0212,
1725
+ "step": 2500
1726
+ },
1727
+ {
1728
+ "epoch": 3.73,
1729
+ "eval_accuracy": 0.9848866498740554,
1730
+ "eval_loss": 0.057428497821092606,
1731
+ "eval_runtime": 19.0519,
1732
+ "eval_samples_per_second": 62.513,
1733
+ "eval_steps_per_second": 7.821,
1734
+ "step": 2500
1735
+ },
1736
+ {
1737
+ "epoch": 3.75,
1738
+ "learning_rate": 3.758706467661692e-05,
1739
+ "loss": 0.0045,
1740
+ "step": 2510
1741
+ },
1742
+ {
1743
+ "epoch": 3.76,
1744
+ "learning_rate": 3.733830845771145e-05,
1745
+ "loss": 0.0036,
1746
+ "step": 2520
1747
+ },
1748
+ {
1749
+ "epoch": 3.78,
1750
+ "learning_rate": 3.708955223880598e-05,
1751
+ "loss": 0.0033,
1752
+ "step": 2530
1753
+ },
1754
+ {
1755
+ "epoch": 3.79,
1756
+ "learning_rate": 3.68407960199005e-05,
1757
+ "loss": 0.0033,
1758
+ "step": 2540
1759
+ },
1760
+ {
1761
+ "epoch": 3.81,
1762
+ "learning_rate": 3.6592039800995024e-05,
1763
+ "loss": 0.0033,
1764
+ "step": 2550
1765
+ },
1766
+ {
1767
+ "epoch": 3.82,
1768
+ "learning_rate": 3.6343283582089554e-05,
1769
+ "loss": 0.0033,
1770
+ "step": 2560
1771
+ },
1772
+ {
1773
+ "epoch": 3.84,
1774
+ "learning_rate": 3.6094527363184084e-05,
1775
+ "loss": 0.0034,
1776
+ "step": 2570
1777
+ },
1778
+ {
1779
+ "epoch": 3.85,
1780
+ "learning_rate": 3.584577114427861e-05,
1781
+ "loss": 0.0031,
1782
+ "step": 2580
1783
+ },
1784
+ {
1785
+ "epoch": 3.87,
1786
+ "learning_rate": 3.559701492537314e-05,
1787
+ "loss": 0.0034,
1788
+ "step": 2590
1789
+ },
1790
+ {
1791
+ "epoch": 3.88,
1792
+ "learning_rate": 3.534825870646766e-05,
1793
+ "loss": 0.0392,
1794
+ "step": 2600
1795
+ },
1796
+ {
1797
+ "epoch": 3.88,
1798
+ "eval_accuracy": 0.982367758186398,
1799
+ "eval_loss": 0.07654974609613419,
1800
+ "eval_runtime": 20.1007,
1801
+ "eval_samples_per_second": 59.252,
1802
+ "eval_steps_per_second": 7.413,
1803
+ "step": 2600
1804
+ },
1805
+ {
1806
+ "epoch": 3.9,
1807
+ "learning_rate": 3.509950248756219e-05,
1808
+ "loss": 0.0307,
1809
+ "step": 2610
1810
+ },
1811
+ {
1812
+ "epoch": 3.91,
1813
+ "learning_rate": 3.485074626865672e-05,
1814
+ "loss": 0.0075,
1815
+ "step": 2620
1816
+ },
1817
+ {
1818
+ "epoch": 3.93,
1819
+ "learning_rate": 3.4601990049751244e-05,
1820
+ "loss": 0.0325,
1821
+ "step": 2630
1822
+ },
1823
+ {
1824
+ "epoch": 3.94,
1825
+ "learning_rate": 3.4353233830845774e-05,
1826
+ "loss": 0.0032,
1827
+ "step": 2640
1828
+ },
1829
+ {
1830
+ "epoch": 3.96,
1831
+ "learning_rate": 3.41044776119403e-05,
1832
+ "loss": 0.0031,
1833
+ "step": 2650
1834
+ },
1835
+ {
1836
+ "epoch": 3.97,
1837
+ "learning_rate": 3.385572139303483e-05,
1838
+ "loss": 0.0151,
1839
+ "step": 2660
1840
+ },
1841
+ {
1842
+ "epoch": 3.99,
1843
+ "learning_rate": 3.360696517412936e-05,
1844
+ "loss": 0.0048,
1845
+ "step": 2670
1846
+ },
1847
+ {
1848
+ "epoch": 4.0,
1849
+ "learning_rate": 3.335820895522388e-05,
1850
+ "loss": 0.003,
1851
+ "step": 2680
1852
+ },
1853
+ {
1854
+ "epoch": 4.01,
1855
+ "learning_rate": 3.310945273631841e-05,
1856
+ "loss": 0.003,
1857
+ "step": 2690
1858
+ },
1859
+ {
1860
+ "epoch": 4.03,
1861
+ "learning_rate": 3.286069651741294e-05,
1862
+ "loss": 0.0029,
1863
+ "step": 2700
1864
+ },
1865
+ {
1866
+ "epoch": 4.03,
1867
+ "eval_accuracy": 0.979009235936188,
1868
+ "eval_loss": 0.073957659304142,
1869
+ "eval_runtime": 19.9521,
1870
+ "eval_samples_per_second": 59.693,
1871
+ "eval_steps_per_second": 7.468,
1872
+ "step": 2700
1873
+ },
1874
+ {
1875
+ "epoch": 4.04,
1876
+ "learning_rate": 3.261194029850746e-05,
1877
+ "loss": 0.0036,
1878
+ "step": 2710
1879
+ },
1880
+ {
1881
+ "epoch": 4.06,
1882
+ "learning_rate": 3.236318407960199e-05,
1883
+ "loss": 0.0039,
1884
+ "step": 2720
1885
+ },
1886
+ {
1887
+ "epoch": 4.07,
1888
+ "learning_rate": 3.2114427860696516e-05,
1889
+ "loss": 0.0026,
1890
+ "step": 2730
1891
+ },
1892
+ {
1893
+ "epoch": 4.09,
1894
+ "learning_rate": 3.1865671641791046e-05,
1895
+ "loss": 0.0031,
1896
+ "step": 2740
1897
+ },
1898
+ {
1899
+ "epoch": 4.1,
1900
+ "learning_rate": 3.1616915422885576e-05,
1901
+ "loss": 0.0031,
1902
+ "step": 2750
1903
+ },
1904
+ {
1905
+ "epoch": 4.12,
1906
+ "learning_rate": 3.13681592039801e-05,
1907
+ "loss": 0.0027,
1908
+ "step": 2760
1909
+ },
1910
+ {
1911
+ "epoch": 4.13,
1912
+ "learning_rate": 3.111940298507463e-05,
1913
+ "loss": 0.0029,
1914
+ "step": 2770
1915
+ },
1916
+ {
1917
+ "epoch": 4.15,
1918
+ "learning_rate": 3.087064676616915e-05,
1919
+ "loss": 0.0029,
1920
+ "step": 2780
1921
+ },
1922
+ {
1923
+ "epoch": 4.16,
1924
+ "learning_rate": 3.062189054726368e-05,
1925
+ "loss": 0.0028,
1926
+ "step": 2790
1927
+ },
1928
+ {
1929
+ "epoch": 4.18,
1930
+ "learning_rate": 3.037313432835821e-05,
1931
+ "loss": 0.0521,
1932
+ "step": 2800
1933
+ },
1934
+ {
1935
+ "epoch": 4.18,
1936
+ "eval_accuracy": 0.9924433249370277,
1937
+ "eval_loss": 0.03492419794201851,
1938
+ "eval_runtime": 20.5925,
1939
+ "eval_samples_per_second": 57.837,
1940
+ "eval_steps_per_second": 7.236,
1941
+ "step": 2800
1942
+ },
1943
+ {
1944
+ "epoch": 4.19,
1945
+ "learning_rate": 3.012437810945274e-05,
1946
+ "loss": 0.0029,
1947
+ "step": 2810
1948
+ },
1949
+ {
1950
+ "epoch": 4.21,
1951
+ "learning_rate": 2.9875621890547266e-05,
1952
+ "loss": 0.0029,
1953
+ "step": 2820
1954
+ },
1955
+ {
1956
+ "epoch": 4.22,
1957
+ "learning_rate": 2.962686567164179e-05,
1958
+ "loss": 0.0027,
1959
+ "step": 2830
1960
+ },
1961
+ {
1962
+ "epoch": 4.24,
1963
+ "learning_rate": 2.937810945273632e-05,
1964
+ "loss": 0.0029,
1965
+ "step": 2840
1966
+ },
1967
+ {
1968
+ "epoch": 4.25,
1969
+ "learning_rate": 2.912935323383085e-05,
1970
+ "loss": 0.0026,
1971
+ "step": 2850
1972
+ },
1973
+ {
1974
+ "epoch": 4.27,
1975
+ "learning_rate": 2.8880597014925376e-05,
1976
+ "loss": 0.004,
1977
+ "step": 2860
1978
+ },
1979
+ {
1980
+ "epoch": 4.28,
1981
+ "learning_rate": 2.8631840796019905e-05,
1982
+ "loss": 0.0027,
1983
+ "step": 2870
1984
+ },
1985
+ {
1986
+ "epoch": 4.3,
1987
+ "learning_rate": 2.838308457711443e-05,
1988
+ "loss": 0.0028,
1989
+ "step": 2880
1990
+ },
1991
+ {
1992
+ "epoch": 4.31,
1993
+ "learning_rate": 2.8134328358208955e-05,
1994
+ "loss": 0.0029,
1995
+ "step": 2890
1996
+ },
1997
+ {
1998
+ "epoch": 4.33,
1999
+ "learning_rate": 2.7885572139303485e-05,
2000
+ "loss": 0.0025,
2001
+ "step": 2900
2002
+ },
2003
+ {
2004
+ "epoch": 4.33,
2005
+ "eval_accuracy": 0.9865659109991604,
2006
+ "eval_loss": 0.06394174695014954,
2007
+ "eval_runtime": 19.857,
2008
+ "eval_samples_per_second": 59.979,
2009
+ "eval_steps_per_second": 7.504,
2010
+ "step": 2900
2011
+ },
2012
+ {
2013
+ "epoch": 4.34,
2014
+ "learning_rate": 2.7636815920398012e-05,
2015
+ "loss": 0.0028,
2016
+ "step": 2910
2017
+ },
2018
+ {
2019
+ "epoch": 4.36,
2020
+ "learning_rate": 2.7388059701492542e-05,
2021
+ "loss": 0.0027,
2022
+ "step": 2920
2023
+ },
2024
+ {
2025
+ "epoch": 4.37,
2026
+ "learning_rate": 2.7139303482587065e-05,
2027
+ "loss": 0.0026,
2028
+ "step": 2930
2029
+ },
2030
+ {
2031
+ "epoch": 4.39,
2032
+ "learning_rate": 2.689054726368159e-05,
2033
+ "loss": 0.0026,
2034
+ "step": 2940
2035
+ },
2036
+ {
2037
+ "epoch": 4.4,
2038
+ "learning_rate": 2.664179104477612e-05,
2039
+ "loss": 0.0025,
2040
+ "step": 2950
2041
+ },
2042
+ {
2043
+ "epoch": 4.42,
2044
+ "learning_rate": 2.6393034825870648e-05,
2045
+ "loss": 0.0025,
2046
+ "step": 2960
2047
+ },
2048
+ {
2049
+ "epoch": 4.43,
2050
+ "learning_rate": 2.6144278606965178e-05,
2051
+ "loss": 0.0027,
2052
+ "step": 2970
2053
+ },
2054
+ {
2055
+ "epoch": 4.45,
2056
+ "learning_rate": 2.58955223880597e-05,
2057
+ "loss": 0.0026,
2058
+ "step": 2980
2059
+ },
2060
+ {
2061
+ "epoch": 4.46,
2062
+ "learning_rate": 2.5646766169154228e-05,
2063
+ "loss": 0.0025,
2064
+ "step": 2990
2065
+ },
2066
+ {
2067
+ "epoch": 4.48,
2068
+ "learning_rate": 2.5398009950248758e-05,
2069
+ "loss": 0.0024,
2070
+ "step": 3000
2071
+ },
2072
+ {
2073
+ "epoch": 4.48,
2074
+ "eval_accuracy": 0.9882451721242653,
2075
+ "eval_loss": 0.050867918878793716,
2076
+ "eval_runtime": 19.0308,
2077
+ "eval_samples_per_second": 62.583,
2078
+ "eval_steps_per_second": 7.829,
2079
+ "step": 3000
2080
+ },
2081
+ {
2082
+ "epoch": 4.49,
2083
+ "learning_rate": 2.5149253731343288e-05,
2084
+ "loss": 0.0033,
2085
+ "step": 3010
2086
+ },
2087
+ {
2088
+ "epoch": 4.51,
2089
+ "learning_rate": 2.490049751243781e-05,
2090
+ "loss": 0.0026,
2091
+ "step": 3020
2092
+ },
2093
+ {
2094
+ "epoch": 4.52,
2095
+ "learning_rate": 2.465174129353234e-05,
2096
+ "loss": 0.0026,
2097
+ "step": 3030
2098
+ },
2099
+ {
2100
+ "epoch": 4.54,
2101
+ "learning_rate": 2.4402985074626868e-05,
2102
+ "loss": 0.0025,
2103
+ "step": 3040
2104
+ },
2105
+ {
2106
+ "epoch": 4.55,
2107
+ "learning_rate": 2.4154228855721394e-05,
2108
+ "loss": 0.0024,
2109
+ "step": 3050
2110
+ },
2111
+ {
2112
+ "epoch": 4.57,
2113
+ "learning_rate": 2.390547263681592e-05,
2114
+ "loss": 0.0025,
2115
+ "step": 3060
2116
+ },
2117
+ {
2118
+ "epoch": 4.58,
2119
+ "learning_rate": 2.3656716417910448e-05,
2120
+ "loss": 0.0025,
2121
+ "step": 3070
2122
+ },
2123
+ {
2124
+ "epoch": 4.6,
2125
+ "learning_rate": 2.3407960199004977e-05,
2126
+ "loss": 0.0361,
2127
+ "step": 3080
2128
+ },
2129
+ {
2130
+ "epoch": 4.61,
2131
+ "learning_rate": 2.3159203980099504e-05,
2132
+ "loss": 0.0024,
2133
+ "step": 3090
2134
+ },
2135
+ {
2136
+ "epoch": 4.63,
2137
+ "learning_rate": 2.291044776119403e-05,
2138
+ "loss": 0.0024,
2139
+ "step": 3100
2140
+ },
2141
+ {
2142
+ "epoch": 4.63,
2143
+ "eval_accuracy": 0.9899244332493703,
2144
+ "eval_loss": 0.0467989481985569,
2145
+ "eval_runtime": 19.003,
2146
+ "eval_samples_per_second": 62.674,
2147
+ "eval_steps_per_second": 7.841,
2148
+ "step": 3100
2149
+ },
2150
+ {
2151
+ "epoch": 4.64,
2152
+ "learning_rate": 2.2661691542288557e-05,
2153
+ "loss": 0.0037,
2154
+ "step": 3110
2155
+ },
2156
+ {
2157
+ "epoch": 4.66,
2158
+ "learning_rate": 2.2412935323383087e-05,
2159
+ "loss": 0.0125,
2160
+ "step": 3120
2161
+ },
2162
+ {
2163
+ "epoch": 4.67,
2164
+ "learning_rate": 2.2164179104477614e-05,
2165
+ "loss": 0.0025,
2166
+ "step": 3130
2167
+ },
2168
+ {
2169
+ "epoch": 4.69,
2170
+ "learning_rate": 2.191542288557214e-05,
2171
+ "loss": 0.0026,
2172
+ "step": 3140
2173
+ },
2174
+ {
2175
+ "epoch": 4.7,
2176
+ "learning_rate": 2.1666666666666667e-05,
2177
+ "loss": 0.0026,
2178
+ "step": 3150
2179
+ },
2180
+ {
2181
+ "epoch": 4.72,
2182
+ "learning_rate": 2.1417910447761194e-05,
2183
+ "loss": 0.0023,
2184
+ "step": 3160
2185
+ },
2186
+ {
2187
+ "epoch": 4.73,
2188
+ "learning_rate": 2.1169154228855724e-05,
2189
+ "loss": 0.0022,
2190
+ "step": 3170
2191
+ },
2192
+ {
2193
+ "epoch": 4.75,
2194
+ "learning_rate": 2.092039800995025e-05,
2195
+ "loss": 0.0024,
2196
+ "step": 3180
2197
+ },
2198
+ {
2199
+ "epoch": 4.76,
2200
+ "learning_rate": 2.0671641791044777e-05,
2201
+ "loss": 0.0023,
2202
+ "step": 3190
2203
+ },
2204
+ {
2205
+ "epoch": 4.78,
2206
+ "learning_rate": 2.0422885572139307e-05,
2207
+ "loss": 0.0024,
2208
+ "step": 3200
2209
+ },
2210
+ {
2211
+ "epoch": 4.78,
2212
+ "eval_accuracy": 0.9882451721242653,
2213
+ "eval_loss": 0.052428748458623886,
2214
+ "eval_runtime": 20.9205,
2215
+ "eval_samples_per_second": 56.93,
2216
+ "eval_steps_per_second": 7.122,
2217
+ "step": 3200
2218
+ },
2219
+ {
2220
+ "epoch": 4.79,
2221
+ "learning_rate": 2.017412935323383e-05,
2222
+ "loss": 0.0132,
2223
+ "step": 3210
2224
+ },
2225
+ {
2226
+ "epoch": 4.81,
2227
+ "learning_rate": 1.992537313432836e-05,
2228
+ "loss": 0.0023,
2229
+ "step": 3220
2230
+ },
2231
+ {
2232
+ "epoch": 4.82,
2233
+ "learning_rate": 1.9676616915422887e-05,
2234
+ "loss": 0.0024,
2235
+ "step": 3230
2236
+ },
2237
+ {
2238
+ "epoch": 4.84,
2239
+ "learning_rate": 1.9427860696517413e-05,
2240
+ "loss": 0.0023,
2241
+ "step": 3240
2242
+ },
2243
+ {
2244
+ "epoch": 4.85,
2245
+ "learning_rate": 1.9179104477611943e-05,
2246
+ "loss": 0.0022,
2247
+ "step": 3250
2248
+ },
2249
+ {
2250
+ "epoch": 4.87,
2251
+ "learning_rate": 1.8930348258706466e-05,
2252
+ "loss": 0.0025,
2253
+ "step": 3260
2254
+ },
2255
+ {
2256
+ "epoch": 4.88,
2257
+ "learning_rate": 1.8681592039800996e-05,
2258
+ "loss": 0.0028,
2259
+ "step": 3270
2260
+ },
2261
+ {
2262
+ "epoch": 4.9,
2263
+ "learning_rate": 1.8432835820895523e-05,
2264
+ "loss": 0.0022,
2265
+ "step": 3280
2266
+ },
2267
+ {
2268
+ "epoch": 4.91,
2269
+ "learning_rate": 1.818407960199005e-05,
2270
+ "loss": 0.0021,
2271
+ "step": 3290
2272
+ },
2273
+ {
2274
+ "epoch": 4.93,
2275
+ "learning_rate": 1.793532338308458e-05,
2276
+ "loss": 0.0022,
2277
+ "step": 3300
2278
+ },
2279
+ {
2280
+ "epoch": 4.93,
2281
+ "eval_accuracy": 0.9899244332493703,
2282
+ "eval_loss": 0.0504080168902874,
2283
+ "eval_runtime": 19.0283,
2284
+ "eval_samples_per_second": 62.591,
2285
+ "eval_steps_per_second": 7.83,
2286
+ "step": 3300
2287
+ },
2288
+ {
2289
+ "epoch": 4.94,
2290
+ "learning_rate": 1.7686567164179106e-05,
2291
+ "loss": 0.0021,
2292
+ "step": 3310
2293
+ },
2294
+ {
2295
+ "epoch": 4.96,
2296
+ "learning_rate": 1.7437810945273633e-05,
2297
+ "loss": 0.0023,
2298
+ "step": 3320
2299
+ },
2300
+ {
2301
+ "epoch": 4.97,
2302
+ "learning_rate": 1.718905472636816e-05,
2303
+ "loss": 0.0022,
2304
+ "step": 3330
2305
+ },
2306
+ {
2307
+ "epoch": 4.99,
2308
+ "learning_rate": 1.6940298507462686e-05,
2309
+ "loss": 0.0021,
2310
+ "step": 3340
2311
+ },
2312
+ {
2313
+ "epoch": 5.0,
2314
+ "learning_rate": 1.6691542288557216e-05,
2315
+ "loss": 0.002,
2316
+ "step": 3350
2317
+ },
2318
+ {
2319
+ "epoch": 5.01,
2320
+ "learning_rate": 1.6442786069651742e-05,
2321
+ "loss": 0.0021,
2322
+ "step": 3360
2323
+ },
2324
+ {
2325
+ "epoch": 5.03,
2326
+ "learning_rate": 1.619402985074627e-05,
2327
+ "loss": 0.0021,
2328
+ "step": 3370
2329
+ },
2330
+ {
2331
+ "epoch": 5.04,
2332
+ "learning_rate": 1.5945273631840796e-05,
2333
+ "loss": 0.0021,
2334
+ "step": 3380
2335
+ },
2336
+ {
2337
+ "epoch": 5.06,
2338
+ "learning_rate": 1.5696517412935326e-05,
2339
+ "loss": 0.0022,
2340
+ "step": 3390
2341
+ },
2342
+ {
2343
+ "epoch": 5.07,
2344
+ "learning_rate": 1.5447761194029852e-05,
2345
+ "loss": 0.0022,
2346
+ "step": 3400
2347
+ },
2348
+ {
2349
+ "epoch": 5.07,
2350
+ "eval_accuracy": 0.9890848026868178,
2351
+ "eval_loss": 0.048807818442583084,
2352
+ "eval_runtime": 20.0305,
2353
+ "eval_samples_per_second": 59.459,
2354
+ "eval_steps_per_second": 7.439,
2355
+ "step": 3400
2356
+ },
2357
+ {
2358
+ "epoch": 5.09,
2359
+ "learning_rate": 1.5199004975124379e-05,
2360
+ "loss": 0.002,
2361
+ "step": 3410
2362
+ },
2363
+ {
2364
+ "epoch": 5.1,
2365
+ "learning_rate": 1.4950248756218907e-05,
2366
+ "loss": 0.0021,
2367
+ "step": 3420
2368
+ },
2369
+ {
2370
+ "epoch": 5.12,
2371
+ "learning_rate": 1.4701492537313432e-05,
2372
+ "loss": 0.0023,
2373
+ "step": 3430
2374
+ },
2375
+ {
2376
+ "epoch": 5.13,
2377
+ "learning_rate": 1.4452736318407962e-05,
2378
+ "loss": 0.0022,
2379
+ "step": 3440
2380
+ },
2381
+ {
2382
+ "epoch": 5.15,
2383
+ "learning_rate": 1.4203980099502487e-05,
2384
+ "loss": 0.002,
2385
+ "step": 3450
2386
+ },
2387
+ {
2388
+ "epoch": 5.16,
2389
+ "learning_rate": 1.3955223880597015e-05,
2390
+ "loss": 0.002,
2391
+ "step": 3460
2392
+ },
2393
+ {
2394
+ "epoch": 5.18,
2395
+ "learning_rate": 1.3706467661691543e-05,
2396
+ "loss": 0.0022,
2397
+ "step": 3470
2398
+ },
2399
+ {
2400
+ "epoch": 5.19,
2401
+ "learning_rate": 1.345771144278607e-05,
2402
+ "loss": 0.0021,
2403
+ "step": 3480
2404
+ },
2405
+ {
2406
+ "epoch": 5.21,
2407
+ "learning_rate": 1.3208955223880598e-05,
2408
+ "loss": 0.0021,
2409
+ "step": 3490
2410
+ },
2411
+ {
2412
+ "epoch": 5.22,
2413
+ "learning_rate": 1.2960199004975123e-05,
2414
+ "loss": 0.0021,
2415
+ "step": 3500
2416
+ },
2417
+ {
2418
+ "epoch": 5.22,
2419
+ "eval_accuracy": 0.9899244332493703,
2420
+ "eval_loss": 0.04841174930334091,
2421
+ "eval_runtime": 19.9366,
2422
+ "eval_samples_per_second": 59.739,
2423
+ "eval_steps_per_second": 7.474,
2424
+ "step": 3500
2425
+ },
2426
+ {
2427
+ "epoch": 5.24,
2428
+ "learning_rate": 1.2711442786069651e-05,
2429
+ "loss": 0.002,
2430
+ "step": 3510
2431
+ },
2432
+ {
2433
+ "epoch": 5.25,
2434
+ "learning_rate": 1.246268656716418e-05,
2435
+ "loss": 0.002,
2436
+ "step": 3520
2437
+ },
2438
+ {
2439
+ "epoch": 5.27,
2440
+ "learning_rate": 1.2213930348258706e-05,
2441
+ "loss": 0.002,
2442
+ "step": 3530
2443
+ },
2444
+ {
2445
+ "epoch": 5.28,
2446
+ "learning_rate": 1.1965174129353235e-05,
2447
+ "loss": 0.002,
2448
+ "step": 3540
2449
+ },
2450
+ {
2451
+ "epoch": 5.3,
2452
+ "learning_rate": 1.1716417910447761e-05,
2453
+ "loss": 0.002,
2454
+ "step": 3550
2455
+ },
2456
+ {
2457
+ "epoch": 5.31,
2458
+ "learning_rate": 1.146766169154229e-05,
2459
+ "loss": 0.0021,
2460
+ "step": 3560
2461
+ },
2462
+ {
2463
+ "epoch": 5.33,
2464
+ "learning_rate": 1.1218905472636816e-05,
2465
+ "loss": 0.0021,
2466
+ "step": 3570
2467
+ },
2468
+ {
2469
+ "epoch": 5.34,
2470
+ "learning_rate": 1.0970149253731343e-05,
2471
+ "loss": 0.0021,
2472
+ "step": 3580
2473
+ },
2474
+ {
2475
+ "epoch": 5.36,
2476
+ "learning_rate": 1.0721393034825871e-05,
2477
+ "loss": 0.002,
2478
+ "step": 3590
2479
+ },
2480
+ {
2481
+ "epoch": 5.37,
2482
+ "learning_rate": 1.04726368159204e-05,
2483
+ "loss": 0.0019,
2484
+ "step": 3600
2485
+ },
2486
+ {
2487
+ "epoch": 5.37,
2488
+ "eval_accuracy": 0.9899244332493703,
2489
+ "eval_loss": 0.04795178398489952,
2490
+ "eval_runtime": 19.4137,
2491
+ "eval_samples_per_second": 61.348,
2492
+ "eval_steps_per_second": 7.675,
2493
+ "step": 3600
2494
+ },
2495
+ {
2496
+ "epoch": 5.39,
2497
+ "learning_rate": 1.0223880597014926e-05,
2498
+ "loss": 0.002,
2499
+ "step": 3610
2500
+ },
2501
+ {
2502
+ "epoch": 5.4,
2503
+ "learning_rate": 9.975124378109452e-06,
2504
+ "loss": 0.002,
2505
+ "step": 3620
2506
+ },
2507
+ {
2508
+ "epoch": 5.42,
2509
+ "learning_rate": 9.72636815920398e-06,
2510
+ "loss": 0.002,
2511
+ "step": 3630
2512
+ },
2513
+ {
2514
+ "epoch": 5.43,
2515
+ "learning_rate": 9.477611940298507e-06,
2516
+ "loss": 0.002,
2517
+ "step": 3640
2518
+ },
2519
+ {
2520
+ "epoch": 5.45,
2521
+ "learning_rate": 9.228855721393036e-06,
2522
+ "loss": 0.002,
2523
+ "step": 3650
2524
+ },
2525
+ {
2526
+ "epoch": 5.46,
2527
+ "learning_rate": 8.980099502487562e-06,
2528
+ "loss": 0.0019,
2529
+ "step": 3660
2530
+ },
2531
+ {
2532
+ "epoch": 5.48,
2533
+ "learning_rate": 8.73134328358209e-06,
2534
+ "loss": 0.0019,
2535
+ "step": 3670
2536
+ },
2537
+ {
2538
+ "epoch": 5.49,
2539
+ "learning_rate": 8.482587064676617e-06,
2540
+ "loss": 0.0019,
2541
+ "step": 3680
2542
+ },
2543
+ {
2544
+ "epoch": 5.51,
2545
+ "learning_rate": 8.233830845771144e-06,
2546
+ "loss": 0.002,
2547
+ "step": 3690
2548
+ },
2549
+ {
2550
+ "epoch": 5.52,
2551
+ "learning_rate": 7.985074626865672e-06,
2552
+ "loss": 0.0019,
2553
+ "step": 3700
2554
+ },
2555
+ {
2556
+ "epoch": 5.52,
2557
+ "eval_accuracy": 0.9899244332493703,
2558
+ "eval_loss": 0.04765713959932327,
2559
+ "eval_runtime": 19.0716,
2560
+ "eval_samples_per_second": 62.449,
2561
+ "eval_steps_per_second": 7.813,
2562
+ "step": 3700
2563
+ },
2564
+ {
2565
+ "epoch": 5.54,
2566
+ "learning_rate": 7.7363184079602e-06,
2567
+ "loss": 0.002,
2568
+ "step": 3710
2569
+ },
2570
+ {
2571
+ "epoch": 5.55,
2572
+ "learning_rate": 7.487562189054727e-06,
2573
+ "loss": 0.0019,
2574
+ "step": 3720
2575
+ },
2576
+ {
2577
+ "epoch": 5.57,
2578
+ "learning_rate": 7.238805970149254e-06,
2579
+ "loss": 0.002,
2580
+ "step": 3730
2581
+ },
2582
+ {
2583
+ "epoch": 5.58,
2584
+ "learning_rate": 6.990049751243781e-06,
2585
+ "loss": 0.0019,
2586
+ "step": 3740
2587
+ },
2588
+ {
2589
+ "epoch": 5.6,
2590
+ "learning_rate": 6.741293532338309e-06,
2591
+ "loss": 0.0019,
2592
+ "step": 3750
2593
+ },
2594
+ {
2595
+ "epoch": 5.61,
2596
+ "learning_rate": 6.492537313432837e-06,
2597
+ "loss": 0.002,
2598
+ "step": 3760
2599
+ },
2600
+ {
2601
+ "epoch": 5.63,
2602
+ "learning_rate": 6.243781094527364e-06,
2603
+ "loss": 0.002,
2604
+ "step": 3770
2605
+ },
2606
+ {
2607
+ "epoch": 5.64,
2608
+ "learning_rate": 5.995024875621891e-06,
2609
+ "loss": 0.002,
2610
+ "step": 3780
2611
+ },
2612
+ {
2613
+ "epoch": 5.66,
2614
+ "learning_rate": 5.746268656716418e-06,
2615
+ "loss": 0.002,
2616
+ "step": 3790
2617
+ },
2618
+ {
2619
+ "epoch": 5.67,
2620
+ "learning_rate": 5.4975124378109455e-06,
2621
+ "loss": 0.0019,
2622
+ "step": 3800
2623
+ },
2624
+ {
2625
+ "epoch": 5.67,
2626
+ "eval_accuracy": 0.9899244332493703,
2627
+ "eval_loss": 0.04748144745826721,
2628
+ "eval_runtime": 19.2405,
2629
+ "eval_samples_per_second": 61.901,
2630
+ "eval_steps_per_second": 7.744,
2631
+ "step": 3800
2632
+ },
2633
+ {
2634
+ "epoch": 5.69,
2635
+ "learning_rate": 5.248756218905473e-06,
2636
+ "loss": 0.002,
2637
+ "step": 3810
2638
+ },
2639
+ {
2640
+ "epoch": 5.7,
2641
+ "learning_rate": 5e-06,
2642
+ "loss": 0.0019,
2643
+ "step": 3820
2644
+ },
2645
+ {
2646
+ "epoch": 5.72,
2647
+ "learning_rate": 4.751243781094528e-06,
2648
+ "loss": 0.0019,
2649
+ "step": 3830
2650
+ },
2651
+ {
2652
+ "epoch": 5.73,
2653
+ "learning_rate": 4.5024875621890544e-06,
2654
+ "loss": 0.0019,
2655
+ "step": 3840
2656
+ },
2657
+ {
2658
+ "epoch": 5.75,
2659
+ "learning_rate": 4.253731343283583e-06,
2660
+ "loss": 0.0018,
2661
+ "step": 3850
2662
+ },
2663
+ {
2664
+ "epoch": 5.76,
2665
+ "learning_rate": 4.004975124378109e-06,
2666
+ "loss": 0.002,
2667
+ "step": 3860
2668
+ },
2669
+ {
2670
+ "epoch": 5.78,
2671
+ "learning_rate": 3.7562189054726368e-06,
2672
+ "loss": 0.0018,
2673
+ "step": 3870
2674
+ },
2675
+ {
2676
+ "epoch": 5.79,
2677
+ "learning_rate": 3.5074626865671646e-06,
2678
+ "loss": 0.0019,
2679
+ "step": 3880
2680
+ },
2681
+ {
2682
+ "epoch": 5.81,
2683
+ "learning_rate": 3.2587064676616916e-06,
2684
+ "loss": 0.0019,
2685
+ "step": 3890
2686
+ },
2687
+ {
2688
+ "epoch": 5.82,
2689
+ "learning_rate": 3.009950248756219e-06,
2690
+ "loss": 0.0018,
2691
+ "step": 3900
2692
+ },
2693
+ {
2694
+ "epoch": 5.82,
2695
+ "eval_accuracy": 0.9899244332493703,
2696
+ "eval_loss": 0.0473811998963356,
2697
+ "eval_runtime": 22.4733,
2698
+ "eval_samples_per_second": 52.996,
2699
+ "eval_steps_per_second": 6.63,
2700
+ "step": 3900
2701
+ },
2702
+ {
2703
+ "epoch": 5.84,
2704
+ "learning_rate": 2.7611940298507465e-06,
2705
+ "loss": 0.002,
2706
+ "step": 3910
2707
+ },
2708
+ {
2709
+ "epoch": 5.85,
2710
+ "learning_rate": 2.512437810945274e-06,
2711
+ "loss": 0.0018,
2712
+ "step": 3920
2713
+ },
2714
+ {
2715
+ "epoch": 5.87,
2716
+ "learning_rate": 2.263681592039801e-06,
2717
+ "loss": 0.0019,
2718
+ "step": 3930
2719
+ },
2720
+ {
2721
+ "epoch": 5.88,
2722
+ "learning_rate": 2.0149253731343284e-06,
2723
+ "loss": 0.0018,
2724
+ "step": 3940
2725
+ },
2726
+ {
2727
+ "epoch": 5.9,
2728
+ "learning_rate": 1.7661691542288559e-06,
2729
+ "loss": 0.002,
2730
+ "step": 3950
2731
+ },
2732
+ {
2733
+ "epoch": 5.91,
2734
+ "learning_rate": 1.517412935323383e-06,
2735
+ "loss": 0.0019,
2736
+ "step": 3960
2737
+ },
2738
+ {
2739
+ "epoch": 5.93,
2740
+ "learning_rate": 1.2686567164179105e-06,
2741
+ "loss": 0.0019,
2742
+ "step": 3970
2743
+ },
2744
+ {
2745
+ "epoch": 5.94,
2746
+ "learning_rate": 1.019900497512438e-06,
2747
+ "loss": 0.002,
2748
+ "step": 3980
2749
+ },
2750
+ {
2751
+ "epoch": 5.96,
2752
+ "learning_rate": 7.711442786069652e-07,
2753
+ "loss": 0.0019,
2754
+ "step": 3990
2755
+ },
2756
+ {
2757
+ "epoch": 5.97,
2758
+ "learning_rate": 5.223880597014926e-07,
2759
+ "loss": 0.0018,
2760
+ "step": 4000
2761
+ },
2762
+ {
2763
+ "epoch": 5.97,
2764
+ "eval_accuracy": 0.9899244332493703,
2765
+ "eval_loss": 0.047312360256910324,
2766
+ "eval_runtime": 19.2838,
2767
+ "eval_samples_per_second": 61.762,
2768
+ "eval_steps_per_second": 7.727,
2769
+ "step": 4000
2770
+ },
2771
+ {
2772
+ "epoch": 5.99,
2773
+ "learning_rate": 2.736318407960199e-07,
2774
+ "loss": 0.0019,
2775
+ "step": 4010
2776
+ },
2777
+ {
2778
+ "epoch": 6.0,
2779
+ "learning_rate": 2.4875621890547262e-08,
2780
+ "loss": 0.002,
2781
+ "step": 4020
2782
+ },
2783
+ {
2784
+ "epoch": 6.0,
2785
+ "step": 4020,
2786
+ "total_flos": 4.98440050928135e+18,
2787
+ "train_loss": 0.10442606876627426,
2788
+ "train_runtime": 2908.5673,
2789
+ "train_samples_per_second": 22.112,
2790
+ "train_steps_per_second": 1.382
2791
  }
2792
  ],
2793
  "logging_steps": 10,
2794
+ "max_steps": 4020,
2795
+ "num_train_epochs": 6,
2796
  "save_steps": 100,
2797
+ "total_flos": 4.98440050928135e+18,
2798
  "trial_name": null,
2799
  "trial_params": null
2800
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b12cc51442839717508742639a1c79fb55d325dbcabdf006967539e263b7feaf
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69c1848e7a2a1c741cb83dcd97efdf744bd596d264cd32a3ec9789f52cf1b61b
3
  size 4027