DazMashaly commited on
Commit
c1aacdd
1 Parent(s): a182762

Training in progress, step 100

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 6.0,
3
- "total_flos": 4.98440050928135e+18,
4
- "train_loss": 0.10442606876627426,
5
- "train_runtime": 2908.5673,
6
- "train_samples_per_second": 22.112,
7
- "train_steps_per_second": 1.382
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 2.610159532500861e+18,
4
+ "train_loss": 0.006745032241080431,
5
+ "train_runtime": 1079.8985,
6
+ "train_samples_per_second": 8.823,
7
+ "train_steps_per_second": 0.552
8
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f9c5ce673f8ff20299561259807b7fd55051c1543e7bb4513d20f541d2604d8
3
  size 1213401965
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cb8b7ed8a22553748b214e3462a06e3a46ab6d01ce1b2716a44795582cac914
3
  size 1213401965
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 6.0,
3
- "total_flos": 4.98440050928135e+18,
4
- "train_loss": 0.10442606876627426,
5
- "train_runtime": 2908.5673,
6
- "train_samples_per_second": 22.112,
7
- "train_steps_per_second": 1.382
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 2.610159532500861e+18,
4
+ "train_loss": 0.006745032241080431,
5
+ "train_runtime": 1079.8985,
6
+ "train_samples_per_second": 8.823,
7
+ "train_steps_per_second": 0.552
8
  }
trainer_state.json CHANGED
@@ -1,2800 +1,427 @@
1
  {
2
- "best_metric": 0.03492419794201851,
3
- "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-2800",
4
- "epoch": 6.0,
5
  "eval_steps": 100,
6
- "global_step": 4020,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "learning_rate": 9.975124378109453e-05,
14
- "loss": 2.5776,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.03,
19
- "learning_rate": 9.950248756218906e-05,
20
- "loss": 2.217,
21
  "step": 20
22
  },
23
  {
24
- "epoch": 0.04,
25
- "learning_rate": 9.925373134328359e-05,
26
- "loss": 1.9279,
27
  "step": 30
28
  },
29
  {
30
- "epoch": 0.06,
31
- "learning_rate": 9.900497512437812e-05,
32
- "loss": 1.6945,
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.07,
37
- "learning_rate": 9.875621890547265e-05,
38
- "loss": 1.4931,
39
  "step": 50
40
  },
41
  {
42
- "epoch": 0.09,
43
- "learning_rate": 9.850746268656717e-05,
44
- "loss": 1.2644,
45
  "step": 60
46
  },
47
  {
48
- "epoch": 0.1,
49
- "learning_rate": 9.82587064676617e-05,
50
- "loss": 1.1095,
51
  "step": 70
52
  },
53
  {
54
- "epoch": 0.12,
55
- "learning_rate": 9.800995024875622e-05,
56
- "loss": 1.0658,
57
  "step": 80
58
  },
59
  {
60
- "epoch": 0.13,
61
- "learning_rate": 9.776119402985075e-05,
62
- "loss": 0.8875,
63
  "step": 90
64
  },
65
  {
66
- "epoch": 0.15,
67
- "learning_rate": 9.751243781094528e-05,
68
- "loss": 0.8306,
69
  "step": 100
70
  },
71
  {
72
- "epoch": 0.15,
73
- "eval_accuracy": 0.90848026868178,
74
- "eval_loss": 0.791517972946167,
75
- "eval_runtime": 20.4511,
76
- "eval_samples_per_second": 58.237,
77
- "eval_steps_per_second": 7.286,
78
  "step": 100
79
  },
80
  {
81
- "epoch": 0.16,
82
- "learning_rate": 9.72636815920398e-05,
83
- "loss": 0.7402,
84
  "step": 110
85
  },
86
  {
87
- "epoch": 0.18,
88
- "learning_rate": 9.701492537313434e-05,
89
- "loss": 0.7802,
90
  "step": 120
91
  },
92
  {
93
- "epoch": 0.19,
94
- "learning_rate": 9.676616915422886e-05,
95
- "loss": 0.6563,
96
  "step": 130
97
  },
98
  {
99
- "epoch": 0.21,
100
- "learning_rate": 9.65174129353234e-05,
101
- "loss": 0.5579,
102
  "step": 140
103
  },
104
  {
105
- "epoch": 0.22,
106
- "learning_rate": 9.626865671641792e-05,
107
- "loss": 0.6564,
108
  "step": 150
109
  },
110
  {
111
- "epoch": 0.24,
112
- "learning_rate": 9.601990049751244e-05,
113
- "loss": 0.5115,
114
  "step": 160
115
  },
116
  {
117
- "epoch": 0.25,
118
- "learning_rate": 9.577114427860697e-05,
119
- "loss": 0.5087,
120
  "step": 170
121
  },
122
  {
123
- "epoch": 0.27,
124
- "learning_rate": 9.552238805970149e-05,
125
- "loss": 0.4821,
126
  "step": 180
127
  },
128
  {
129
- "epoch": 0.28,
130
- "learning_rate": 9.527363184079603e-05,
131
- "loss": 0.3972,
132
  "step": 190
133
  },
134
  {
135
- "epoch": 0.3,
136
- "learning_rate": 9.502487562189055e-05,
137
- "loss": 0.3828,
138
  "step": 200
139
  },
140
  {
141
- "epoch": 0.3,
142
- "eval_accuracy": 0.9269521410579346,
143
- "eval_loss": 0.42952126264572144,
144
- "eval_runtime": 18.5768,
145
- "eval_samples_per_second": 64.112,
146
- "eval_steps_per_second": 8.021,
147
  "step": 200
148
  },
149
  {
150
- "epoch": 0.31,
151
- "learning_rate": 9.477611940298507e-05,
152
- "loss": 0.4261,
153
  "step": 210
154
  },
155
  {
156
- "epoch": 0.33,
157
- "learning_rate": 9.452736318407961e-05,
158
- "loss": 0.3445,
159
  "step": 220
160
  },
161
  {
162
- "epoch": 0.34,
163
- "learning_rate": 9.427860696517413e-05,
164
- "loss": 0.3152,
165
  "step": 230
166
  },
167
  {
168
- "epoch": 0.36,
169
- "learning_rate": 9.402985074626867e-05,
170
- "loss": 0.3448,
171
  "step": 240
172
  },
173
  {
174
- "epoch": 0.37,
175
- "learning_rate": 9.37810945273632e-05,
176
- "loss": 0.4312,
177
  "step": 250
178
  },
179
  {
180
- "epoch": 0.39,
181
- "learning_rate": 9.353233830845772e-05,
182
- "loss": 0.3472,
183
  "step": 260
184
  },
185
  {
186
- "epoch": 0.4,
187
- "learning_rate": 9.328358208955224e-05,
188
- "loss": 0.4248,
189
  "step": 270
190
  },
191
  {
192
- "epoch": 0.42,
193
- "learning_rate": 9.303482587064676e-05,
194
- "loss": 0.3977,
195
  "step": 280
196
  },
197
  {
198
- "epoch": 0.43,
199
- "learning_rate": 9.27860696517413e-05,
200
- "loss": 0.3164,
201
  "step": 290
202
  },
203
  {
204
- "epoch": 0.45,
205
- "learning_rate": 9.253731343283582e-05,
206
- "loss": 0.3623,
207
  "step": 300
208
  },
209
  {
210
- "epoch": 0.45,
211
- "eval_accuracy": 0.9370277078085643,
212
- "eval_loss": 0.30219167470932007,
213
- "eval_runtime": 23.4722,
214
- "eval_samples_per_second": 50.741,
215
- "eval_steps_per_second": 6.348,
216
  "step": 300
217
  },
218
  {
219
- "epoch": 0.46,
220
- "learning_rate": 9.228855721393035e-05,
221
- "loss": 0.2591,
222
  "step": 310
223
  },
224
  {
225
- "epoch": 0.48,
226
- "learning_rate": 9.203980099502488e-05,
227
- "loss": 0.277,
228
  "step": 320
229
  },
230
  {
231
- "epoch": 0.49,
232
- "learning_rate": 9.17910447761194e-05,
233
- "loss": 0.2457,
234
  "step": 330
235
  },
236
  {
237
- "epoch": 0.51,
238
- "learning_rate": 9.154228855721394e-05,
239
- "loss": 0.1788,
240
  "step": 340
241
  },
242
  {
243
- "epoch": 0.52,
244
- "learning_rate": 9.129353233830847e-05,
245
- "loss": 0.2828,
246
  "step": 350
247
  },
248
  {
249
- "epoch": 0.54,
250
- "learning_rate": 9.104477611940299e-05,
251
- "loss": 0.2579,
252
  "step": 360
253
  },
254
  {
255
- "epoch": 0.55,
256
- "learning_rate": 9.079601990049753e-05,
257
- "loss": 0.2195,
258
  "step": 370
259
  },
260
  {
261
- "epoch": 0.57,
262
- "learning_rate": 9.054726368159205e-05,
263
- "loss": 0.2773,
264
  "step": 380
265
  },
266
  {
267
- "epoch": 0.58,
268
- "learning_rate": 9.029850746268657e-05,
269
- "loss": 0.188,
270
  "step": 390
271
  },
272
  {
273
- "epoch": 0.6,
274
- "learning_rate": 9.00497512437811e-05,
275
- "loss": 0.2623,
276
  "step": 400
277
  },
278
  {
279
- "epoch": 0.6,
280
- "eval_accuracy": 0.9269521410579346,
281
- "eval_loss": 0.2882852256298065,
282
- "eval_runtime": 19.7432,
283
- "eval_samples_per_second": 60.325,
284
- "eval_steps_per_second": 7.547,
285
  "step": 400
286
  },
287
  {
288
- "epoch": 0.61,
289
- "learning_rate": 8.980099502487562e-05,
290
- "loss": 0.2402,
291
  "step": 410
292
  },
293
  {
294
- "epoch": 0.63,
295
- "learning_rate": 8.955223880597016e-05,
296
- "loss": 0.1562,
297
  "step": 420
298
  },
299
  {
300
- "epoch": 0.64,
301
- "learning_rate": 8.930348258706468e-05,
302
- "loss": 0.3121,
303
  "step": 430
304
  },
305
  {
306
- "epoch": 0.66,
307
- "learning_rate": 8.905472636815922e-05,
308
- "loss": 0.2505,
309
  "step": 440
310
  },
311
  {
312
- "epoch": 0.67,
313
- "learning_rate": 8.880597014925374e-05,
314
- "loss": 0.2329,
315
  "step": 450
316
  },
317
  {
318
- "epoch": 0.69,
319
- "learning_rate": 8.855721393034826e-05,
320
- "loss": 0.1912,
321
  "step": 460
322
  },
323
  {
324
- "epoch": 0.7,
325
- "learning_rate": 8.83084577114428e-05,
326
- "loss": 0.2406,
327
  "step": 470
328
  },
329
  {
330
- "epoch": 0.72,
331
- "learning_rate": 8.805970149253732e-05,
332
- "loss": 0.1853,
333
  "step": 480
334
  },
335
  {
336
- "epoch": 0.73,
337
- "learning_rate": 8.781094527363185e-05,
338
- "loss": 0.2545,
339
  "step": 490
340
  },
341
  {
342
- "epoch": 0.75,
343
- "learning_rate": 8.756218905472637e-05,
344
- "loss": 0.2251,
345
  "step": 500
346
  },
347
  {
348
- "epoch": 0.75,
349
- "eval_accuracy": 0.9462636439966414,
350
- "eval_loss": 0.23029384016990662,
351
- "eval_runtime": 20.0888,
352
- "eval_samples_per_second": 59.287,
353
- "eval_steps_per_second": 7.417,
354
  "step": 500
355
  },
356
  {
357
- "epoch": 0.76,
358
- "learning_rate": 8.731343283582089e-05,
359
- "loss": 0.1969,
360
  "step": 510
361
  },
362
  {
363
- "epoch": 0.78,
364
- "learning_rate": 8.706467661691543e-05,
365
- "loss": 0.1476,
366
  "step": 520
367
  },
368
  {
369
- "epoch": 0.79,
370
- "learning_rate": 8.681592039800995e-05,
371
- "loss": 0.2066,
372
  "step": 530
373
  },
374
  {
375
- "epoch": 0.81,
376
- "learning_rate": 8.656716417910447e-05,
377
- "loss": 0.3319,
378
  "step": 540
379
  },
380
  {
381
- "epoch": 0.82,
382
- "learning_rate": 8.631840796019901e-05,
383
- "loss": 0.1591,
384
  "step": 550
385
  },
386
- {
387
- "epoch": 0.84,
388
- "learning_rate": 8.606965174129353e-05,
389
- "loss": 0.1666,
390
- "step": 560
391
- },
392
- {
393
- "epoch": 0.85,
394
- "learning_rate": 8.582089552238807e-05,
395
- "loss": 0.2891,
396
- "step": 570
397
- },
398
- {
399
- "epoch": 0.87,
400
- "learning_rate": 8.55721393034826e-05,
401
- "loss": 0.178,
402
- "step": 580
403
- },
404
- {
405
- "epoch": 0.88,
406
- "learning_rate": 8.532338308457712e-05,
407
- "loss": 0.1507,
408
- "step": 590
409
- },
410
- {
411
- "epoch": 0.9,
412
- "learning_rate": 8.507462686567164e-05,
413
- "loss": 0.2695,
414
- "step": 600
415
- },
416
- {
417
- "epoch": 0.9,
418
- "eval_accuracy": 0.9378673383711167,
419
- "eval_loss": 0.2464931756258011,
420
- "eval_runtime": 19.6563,
421
- "eval_samples_per_second": 60.591,
422
- "eval_steps_per_second": 7.58,
423
- "step": 600
424
- },
425
- {
426
- "epoch": 0.91,
427
- "learning_rate": 8.482587064676616e-05,
428
- "loss": 0.2417,
429
- "step": 610
430
- },
431
- {
432
- "epoch": 0.93,
433
- "learning_rate": 8.45771144278607e-05,
434
- "loss": 0.2455,
435
- "step": 620
436
- },
437
  {
438
  "epoch": 0.94,
439
- "learning_rate": 8.432835820895522e-05,
440
- "loss": 0.1249,
441
- "step": 630
442
  },
443
  {
444
  "epoch": 0.96,
445
- "learning_rate": 8.407960199004975e-05,
446
- "loss": 0.1533,
447
- "step": 640
448
  },
449
  {
450
  "epoch": 0.97,
451
- "learning_rate": 8.383084577114428e-05,
452
- "loss": 0.1838,
453
- "step": 650
454
  },
455
  {
456
  "epoch": 0.99,
457
- "learning_rate": 8.360696517412937e-05,
458
- "loss": 0.1247,
459
- "step": 660
460
  },
461
  {
462
  "epoch": 1.0,
463
- "learning_rate": 8.335820895522388e-05,
464
- "loss": 0.1583,
465
- "step": 670
466
- },
467
- {
468
- "epoch": 1.01,
469
- "learning_rate": 8.310945273631841e-05,
470
- "loss": 0.1024,
471
- "step": 680
472
- },
473
- {
474
- "epoch": 1.03,
475
- "learning_rate": 8.286069651741294e-05,
476
- "loss": 0.0599,
477
- "step": 690
478
- },
479
- {
480
- "epoch": 1.04,
481
- "learning_rate": 8.261194029850746e-05,
482
- "loss": 0.1256,
483
- "step": 700
484
- },
485
- {
486
- "epoch": 1.04,
487
- "eval_accuracy": 0.9714525608732157,
488
- "eval_loss": 0.12489016354084015,
489
- "eval_runtime": 18.8609,
490
- "eval_samples_per_second": 63.146,
491
- "eval_steps_per_second": 7.9,
492
- "step": 700
493
- },
494
- {
495
- "epoch": 1.06,
496
- "learning_rate": 8.2363184079602e-05,
497
- "loss": 0.0541,
498
- "step": 710
499
- },
500
- {
501
- "epoch": 1.07,
502
- "learning_rate": 8.211442786069652e-05,
503
- "loss": 0.0742,
504
- "step": 720
505
- },
506
- {
507
- "epoch": 1.09,
508
- "learning_rate": 8.186567164179106e-05,
509
- "loss": 0.0975,
510
- "step": 730
511
- },
512
- {
513
- "epoch": 1.1,
514
- "learning_rate": 8.161691542288558e-05,
515
- "loss": 0.1011,
516
- "step": 740
517
- },
518
- {
519
- "epoch": 1.12,
520
- "learning_rate": 8.13681592039801e-05,
521
- "loss": 0.1081,
522
- "step": 750
523
- },
524
- {
525
- "epoch": 1.13,
526
- "learning_rate": 8.111940298507464e-05,
527
- "loss": 0.129,
528
- "step": 760
529
- },
530
- {
531
- "epoch": 1.15,
532
- "learning_rate": 8.087064676616916e-05,
533
- "loss": 0.1595,
534
- "step": 770
535
- },
536
- {
537
- "epoch": 1.16,
538
- "learning_rate": 8.062189054726368e-05,
539
- "loss": 0.163,
540
- "step": 780
541
- },
542
- {
543
- "epoch": 1.18,
544
- "learning_rate": 8.037313432835821e-05,
545
- "loss": 0.1464,
546
- "step": 790
547
- },
548
- {
549
- "epoch": 1.19,
550
- "learning_rate": 8.012437810945273e-05,
551
- "loss": 0.0718,
552
- "step": 800
553
- },
554
- {
555
- "epoch": 1.19,
556
- "eval_accuracy": 0.9622166246851386,
557
- "eval_loss": 0.13346683979034424,
558
- "eval_runtime": 19.3064,
559
- "eval_samples_per_second": 61.689,
560
- "eval_steps_per_second": 7.718,
561
- "step": 800
562
- },
563
- {
564
- "epoch": 1.21,
565
- "learning_rate": 7.987562189054727e-05,
566
- "loss": 0.0517,
567
- "step": 810
568
- },
569
- {
570
- "epoch": 1.22,
571
- "learning_rate": 7.962686567164179e-05,
572
- "loss": 0.1143,
573
- "step": 820
574
- },
575
- {
576
- "epoch": 1.24,
577
- "learning_rate": 7.937810945273633e-05,
578
- "loss": 0.062,
579
- "step": 830
580
- },
581
- {
582
- "epoch": 1.25,
583
- "learning_rate": 7.912935323383085e-05,
584
- "loss": 0.2013,
585
- "step": 840
586
- },
587
- {
588
- "epoch": 1.27,
589
- "learning_rate": 7.888059701492537e-05,
590
- "loss": 0.0995,
591
- "step": 850
592
- },
593
- {
594
- "epoch": 1.28,
595
- "learning_rate": 7.863184079601991e-05,
596
- "loss": 0.1009,
597
- "step": 860
598
- },
599
- {
600
- "epoch": 1.3,
601
- "learning_rate": 7.838308457711443e-05,
602
- "loss": 0.1292,
603
- "step": 870
604
- },
605
- {
606
- "epoch": 1.31,
607
- "learning_rate": 7.813432835820896e-05,
608
- "loss": 0.0848,
609
- "step": 880
610
- },
611
- {
612
- "epoch": 1.33,
613
- "learning_rate": 7.788557213930348e-05,
614
- "loss": 0.0323,
615
- "step": 890
616
- },
617
- {
618
- "epoch": 1.34,
619
- "learning_rate": 7.7636815920398e-05,
620
- "loss": 0.0436,
621
- "step": 900
622
- },
623
- {
624
- "epoch": 1.34,
625
- "eval_accuracy": 0.979009235936188,
626
- "eval_loss": 0.09892778843641281,
627
- "eval_runtime": 20.325,
628
- "eval_samples_per_second": 58.598,
629
- "eval_steps_per_second": 7.331,
630
- "step": 900
631
- },
632
- {
633
- "epoch": 1.36,
634
- "learning_rate": 7.738805970149254e-05,
635
- "loss": 0.0345,
636
- "step": 910
637
- },
638
- {
639
- "epoch": 1.37,
640
- "learning_rate": 7.713930348258706e-05,
641
- "loss": 0.0505,
642
- "step": 920
643
- },
644
- {
645
- "epoch": 1.39,
646
- "learning_rate": 7.68905472636816e-05,
647
- "loss": 0.0464,
648
- "step": 930
649
- },
650
- {
651
- "epoch": 1.4,
652
- "learning_rate": 7.664179104477612e-05,
653
- "loss": 0.0654,
654
- "step": 940
655
- },
656
- {
657
- "epoch": 1.42,
658
- "learning_rate": 7.639303482587065e-05,
659
- "loss": 0.0776,
660
- "step": 950
661
- },
662
- {
663
- "epoch": 1.43,
664
- "learning_rate": 7.614427860696518e-05,
665
- "loss": 0.0396,
666
- "step": 960
667
- },
668
- {
669
- "epoch": 1.45,
670
- "learning_rate": 7.589552238805971e-05,
671
- "loss": 0.0373,
672
- "step": 970
673
- },
674
- {
675
- "epoch": 1.46,
676
- "learning_rate": 7.564676616915424e-05,
677
- "loss": 0.083,
678
- "step": 980
679
- },
680
- {
681
- "epoch": 1.48,
682
- "learning_rate": 7.539800995024875e-05,
683
- "loss": 0.1018,
684
- "step": 990
685
- },
686
- {
687
- "epoch": 1.49,
688
- "learning_rate": 7.514925373134328e-05,
689
- "loss": 0.1681,
690
- "step": 1000
691
- },
692
- {
693
- "epoch": 1.49,
694
- "eval_accuracy": 0.9496221662468514,
695
- "eval_loss": 0.18528999388217926,
696
- "eval_runtime": 19.3383,
697
- "eval_samples_per_second": 61.588,
698
- "eval_steps_per_second": 7.705,
699
- "step": 1000
700
- },
701
- {
702
- "epoch": 1.51,
703
- "learning_rate": 7.490049751243781e-05,
704
- "loss": 0.0607,
705
- "step": 1010
706
- },
707
- {
708
- "epoch": 1.52,
709
- "learning_rate": 7.465174129353234e-05,
710
- "loss": 0.1343,
711
- "step": 1020
712
- },
713
- {
714
- "epoch": 1.54,
715
- "learning_rate": 7.440298507462687e-05,
716
- "loss": 0.1163,
717
- "step": 1030
718
- },
719
- {
720
- "epoch": 1.55,
721
- "learning_rate": 7.41542288557214e-05,
722
- "loss": 0.0628,
723
- "step": 1040
724
- },
725
- {
726
- "epoch": 1.57,
727
- "learning_rate": 7.390547263681592e-05,
728
- "loss": 0.1001,
729
- "step": 1050
730
- },
731
- {
732
- "epoch": 1.58,
733
- "learning_rate": 7.365671641791046e-05,
734
- "loss": 0.0764,
735
- "step": 1060
736
- },
737
- {
738
- "epoch": 1.6,
739
- "learning_rate": 7.340796019900498e-05,
740
- "loss": 0.0281,
741
- "step": 1070
742
- },
743
- {
744
- "epoch": 1.61,
745
- "learning_rate": 7.315920398009952e-05,
746
- "loss": 0.0861,
747
- "step": 1080
748
- },
749
- {
750
- "epoch": 1.63,
751
- "learning_rate": 7.291044776119404e-05,
752
- "loss": 0.0305,
753
- "step": 1090
754
- },
755
- {
756
- "epoch": 1.64,
757
- "learning_rate": 7.266169154228856e-05,
758
- "loss": 0.0771,
759
- "step": 1100
760
- },
761
- {
762
- "epoch": 1.64,
763
- "eval_accuracy": 0.964735516372796,
764
- "eval_loss": 0.14002813398838043,
765
- "eval_runtime": 19.5,
766
- "eval_samples_per_second": 61.077,
767
- "eval_steps_per_second": 7.641,
768
- "step": 1100
769
- },
770
- {
771
- "epoch": 1.66,
772
- "learning_rate": 7.241293532338309e-05,
773
- "loss": 0.1127,
774
- "step": 1110
775
- },
776
- {
777
- "epoch": 1.67,
778
- "learning_rate": 7.216417910447761e-05,
779
- "loss": 0.0412,
780
- "step": 1120
781
- },
782
- {
783
- "epoch": 1.69,
784
- "learning_rate": 7.191542288557215e-05,
785
- "loss": 0.1501,
786
- "step": 1130
787
- },
788
- {
789
- "epoch": 1.7,
790
- "learning_rate": 7.166666666666667e-05,
791
- "loss": 0.1287,
792
- "step": 1140
793
- },
794
- {
795
- "epoch": 1.72,
796
- "learning_rate": 7.141791044776119e-05,
797
- "loss": 0.0528,
798
- "step": 1150
799
- },
800
- {
801
- "epoch": 1.73,
802
- "learning_rate": 7.116915422885573e-05,
803
- "loss": 0.0139,
804
- "step": 1160
805
- },
806
- {
807
- "epoch": 1.75,
808
- "learning_rate": 7.092039800995025e-05,
809
- "loss": 0.0465,
810
- "step": 1170
811
- },
812
- {
813
- "epoch": 1.76,
814
- "learning_rate": 7.067164179104479e-05,
815
- "loss": 0.0412,
816
- "step": 1180
817
- },
818
- {
819
- "epoch": 1.78,
820
- "learning_rate": 7.042288557213931e-05,
821
- "loss": 0.0812,
822
- "step": 1190
823
- },
824
- {
825
- "epoch": 1.79,
826
- "learning_rate": 7.017412935323384e-05,
827
- "loss": 0.0815,
828
- "step": 1200
829
- },
830
- {
831
- "epoch": 1.79,
832
- "eval_accuracy": 0.9764903442485307,
833
- "eval_loss": 0.0969606265425682,
834
- "eval_runtime": 18.9862,
835
- "eval_samples_per_second": 62.73,
836
- "eval_steps_per_second": 7.848,
837
- "step": 1200
838
- },
839
- {
840
- "epoch": 1.81,
841
- "learning_rate": 6.992537313432836e-05,
842
- "loss": 0.0212,
843
- "step": 1210
844
- },
845
- {
846
- "epoch": 1.82,
847
- "learning_rate": 6.967661691542288e-05,
848
- "loss": 0.0576,
849
- "step": 1220
850
- },
851
- {
852
- "epoch": 1.84,
853
- "learning_rate": 6.942786069651742e-05,
854
- "loss": 0.0269,
855
- "step": 1230
856
- },
857
- {
858
- "epoch": 1.85,
859
- "learning_rate": 6.917910447761194e-05,
860
- "loss": 0.0557,
861
- "step": 1240
862
- },
863
- {
864
- "epoch": 1.87,
865
- "learning_rate": 6.893034825870647e-05,
866
- "loss": 0.1284,
867
- "step": 1250
868
- },
869
- {
870
- "epoch": 1.88,
871
- "learning_rate": 6.8681592039801e-05,
872
- "loss": 0.0499,
873
- "step": 1260
874
- },
875
- {
876
- "epoch": 1.9,
877
- "learning_rate": 6.843283582089553e-05,
878
- "loss": 0.114,
879
- "step": 1270
880
- },
881
- {
882
- "epoch": 1.91,
883
- "learning_rate": 6.818407960199006e-05,
884
- "loss": 0.0338,
885
- "step": 1280
886
- },
887
- {
888
- "epoch": 1.93,
889
- "learning_rate": 6.793532338308459e-05,
890
- "loss": 0.1359,
891
- "step": 1290
892
- },
893
- {
894
- "epoch": 1.94,
895
- "learning_rate": 6.768656716417911e-05,
896
- "loss": 0.0203,
897
- "step": 1300
898
- },
899
- {
900
- "epoch": 1.94,
901
- "eval_accuracy": 0.964735516372796,
902
- "eval_loss": 0.14076600968837738,
903
- "eval_runtime": 19.9477,
904
- "eval_samples_per_second": 59.706,
905
- "eval_steps_per_second": 7.47,
906
- "step": 1300
907
- },
908
- {
909
- "epoch": 1.96,
910
- "learning_rate": 6.743781094527363e-05,
911
- "loss": 0.0373,
912
- "step": 1310
913
- },
914
- {
915
- "epoch": 1.97,
916
- "learning_rate": 6.718905472636815e-05,
917
- "loss": 0.0751,
918
- "step": 1320
919
- },
920
- {
921
- "epoch": 1.99,
922
- "learning_rate": 6.694029850746268e-05,
923
- "loss": 0.0923,
924
- "step": 1330
925
- },
926
- {
927
- "epoch": 2.0,
928
- "learning_rate": 6.669154228855721e-05,
929
- "loss": 0.0406,
930
- "step": 1340
931
- },
932
- {
933
- "epoch": 2.01,
934
- "learning_rate": 6.644278606965174e-05,
935
- "loss": 0.0333,
936
- "step": 1350
937
- },
938
- {
939
- "epoch": 2.03,
940
- "learning_rate": 6.619402985074627e-05,
941
- "loss": 0.0674,
942
- "step": 1360
943
- },
944
- {
945
- "epoch": 2.04,
946
- "learning_rate": 6.59452736318408e-05,
947
- "loss": 0.0971,
948
- "step": 1370
949
- },
950
- {
951
- "epoch": 2.06,
952
- "learning_rate": 6.569651741293532e-05,
953
- "loss": 0.1701,
954
- "step": 1380
955
- },
956
- {
957
- "epoch": 2.07,
958
- "learning_rate": 6.544776119402986e-05,
959
- "loss": 0.1362,
960
- "step": 1390
961
- },
962
- {
963
- "epoch": 2.09,
964
- "learning_rate": 6.519900497512438e-05,
965
- "loss": 0.0774,
966
- "step": 1400
967
- },
968
- {
969
- "epoch": 2.09,
970
- "eval_accuracy": 0.966414777497901,
971
- "eval_loss": 0.12388637661933899,
972
- "eval_runtime": 19.0185,
973
- "eval_samples_per_second": 62.623,
974
- "eval_steps_per_second": 7.834,
975
- "step": 1400
976
- },
977
- {
978
- "epoch": 2.1,
979
- "learning_rate": 6.495024875621892e-05,
980
- "loss": 0.0306,
981
- "step": 1410
982
- },
983
- {
984
- "epoch": 2.12,
985
- "learning_rate": 6.470149253731344e-05,
986
- "loss": 0.0421,
987
- "step": 1420
988
- },
989
- {
990
- "epoch": 2.13,
991
- "learning_rate": 6.445273631840796e-05,
992
- "loss": 0.0408,
993
- "step": 1430
994
- },
995
- {
996
- "epoch": 2.15,
997
- "learning_rate": 6.420398009950249e-05,
998
- "loss": 0.0333,
999
- "step": 1440
1000
- },
1001
- {
1002
- "epoch": 2.16,
1003
- "learning_rate": 6.395522388059701e-05,
1004
- "loss": 0.0124,
1005
- "step": 1450
1006
- },
1007
- {
1008
- "epoch": 2.18,
1009
- "learning_rate": 6.370646766169155e-05,
1010
- "loss": 0.0546,
1011
- "step": 1460
1012
- },
1013
- {
1014
- "epoch": 2.19,
1015
- "learning_rate": 6.345771144278607e-05,
1016
- "loss": 0.0088,
1017
- "step": 1470
1018
- },
1019
- {
1020
- "epoch": 2.21,
1021
- "learning_rate": 6.32089552238806e-05,
1022
- "loss": 0.0525,
1023
- "step": 1480
1024
- },
1025
- {
1026
- "epoch": 2.22,
1027
- "learning_rate": 6.296019900497513e-05,
1028
- "loss": 0.0336,
1029
- "step": 1490
1030
- },
1031
- {
1032
- "epoch": 2.24,
1033
- "learning_rate": 6.271144278606965e-05,
1034
- "loss": 0.0723,
1035
- "step": 1500
1036
- },
1037
- {
1038
- "epoch": 2.24,
1039
- "eval_accuracy": 0.982367758186398,
1040
- "eval_loss": 0.06230420619249344,
1041
- "eval_runtime": 20.5805,
1042
- "eval_samples_per_second": 57.87,
1043
- "eval_steps_per_second": 7.24,
1044
- "step": 1500
1045
- },
1046
- {
1047
- "epoch": 2.25,
1048
- "learning_rate": 6.246268656716419e-05,
1049
- "loss": 0.04,
1050
- "step": 1510
1051
- },
1052
- {
1053
- "epoch": 2.27,
1054
- "learning_rate": 6.221393034825871e-05,
1055
- "loss": 0.029,
1056
- "step": 1520
1057
- },
1058
- {
1059
- "epoch": 2.28,
1060
- "learning_rate": 6.196517412935324e-05,
1061
- "loss": 0.0161,
1062
- "step": 1530
1063
- },
1064
- {
1065
- "epoch": 2.3,
1066
- "learning_rate": 6.171641791044776e-05,
1067
- "loss": 0.0307,
1068
- "step": 1540
1069
- },
1070
- {
1071
- "epoch": 2.31,
1072
- "learning_rate": 6.146766169154228e-05,
1073
- "loss": 0.0099,
1074
- "step": 1550
1075
- },
1076
- {
1077
- "epoch": 2.33,
1078
- "learning_rate": 6.121890547263682e-05,
1079
- "loss": 0.0814,
1080
- "step": 1560
1081
- },
1082
- {
1083
- "epoch": 2.34,
1084
- "learning_rate": 6.0970149253731343e-05,
1085
- "loss": 0.052,
1086
- "step": 1570
1087
- },
1088
- {
1089
- "epoch": 2.36,
1090
- "learning_rate": 6.0721393034825867e-05,
1091
- "loss": 0.0437,
1092
- "step": 1580
1093
- },
1094
- {
1095
- "epoch": 2.37,
1096
- "learning_rate": 6.04726368159204e-05,
1097
- "loss": 0.0411,
1098
- "step": 1590
1099
- },
1100
- {
1101
- "epoch": 2.39,
1102
- "learning_rate": 6.0223880597014927e-05,
1103
- "loss": 0.0187,
1104
- "step": 1600
1105
- },
1106
- {
1107
- "epoch": 2.39,
1108
- "eval_accuracy": 0.9731318219983207,
1109
- "eval_loss": 0.0926276370882988,
1110
- "eval_runtime": 20.5962,
1111
- "eval_samples_per_second": 57.826,
1112
- "eval_steps_per_second": 7.234,
1113
- "step": 1600
1114
- },
1115
- {
1116
- "epoch": 2.4,
1117
- "learning_rate": 5.9975124378109457e-05,
1118
- "loss": 0.0374,
1119
- "step": 1610
1120
- },
1121
- {
1122
- "epoch": 2.42,
1123
- "learning_rate": 5.972636815920398e-05,
1124
- "loss": 0.0216,
1125
- "step": 1620
1126
- },
1127
- {
1128
- "epoch": 2.43,
1129
- "learning_rate": 5.94776119402985e-05,
1130
- "loss": 0.0667,
1131
- "step": 1630
1132
- },
1133
- {
1134
- "epoch": 2.45,
1135
- "learning_rate": 5.922885572139304e-05,
1136
- "loss": 0.0691,
1137
- "step": 1640
1138
- },
1139
- {
1140
- "epoch": 2.46,
1141
- "learning_rate": 5.898009950248756e-05,
1142
- "loss": 0.0364,
1143
- "step": 1650
1144
- },
1145
- {
1146
- "epoch": 2.48,
1147
- "learning_rate": 5.87313432835821e-05,
1148
- "loss": 0.0212,
1149
- "step": 1660
1150
- },
1151
- {
1152
- "epoch": 2.49,
1153
- "learning_rate": 5.8482587064676616e-05,
1154
- "loss": 0.0209,
1155
- "step": 1670
1156
- },
1157
- {
1158
- "epoch": 2.51,
1159
- "learning_rate": 5.823383084577114e-05,
1160
- "loss": 0.012,
1161
- "step": 1680
1162
- },
1163
- {
1164
- "epoch": 2.52,
1165
- "learning_rate": 5.7985074626865676e-05,
1166
- "loss": 0.0118,
1167
- "step": 1690
1168
- },
1169
- {
1170
- "epoch": 2.54,
1171
- "learning_rate": 5.77363184079602e-05,
1172
- "loss": 0.055,
1173
- "step": 1700
1174
- },
1175
- {
1176
- "epoch": 2.54,
1177
- "eval_accuracy": 0.9756507136859782,
1178
- "eval_loss": 0.09100380539894104,
1179
- "eval_runtime": 19.4994,
1180
- "eval_samples_per_second": 61.079,
1181
- "eval_steps_per_second": 7.641,
1182
- "step": 1700
1183
- },
1184
- {
1185
- "epoch": 2.55,
1186
- "learning_rate": 5.7487562189054736e-05,
1187
- "loss": 0.0161,
1188
- "step": 1710
1189
- },
1190
- {
1191
- "epoch": 2.57,
1192
- "learning_rate": 5.723880597014926e-05,
1193
- "loss": 0.0071,
1194
- "step": 1720
1195
- },
1196
- {
1197
- "epoch": 2.58,
1198
- "learning_rate": 5.699004975124378e-05,
1199
- "loss": 0.0103,
1200
- "step": 1730
1201
- },
1202
- {
1203
- "epoch": 2.6,
1204
- "learning_rate": 5.674129353233831e-05,
1205
- "loss": 0.0706,
1206
- "step": 1740
1207
- },
1208
- {
1209
- "epoch": 2.61,
1210
- "learning_rate": 5.6492537313432836e-05,
1211
- "loss": 0.0523,
1212
- "step": 1750
1213
- },
1214
- {
1215
- "epoch": 2.63,
1216
- "learning_rate": 5.624378109452737e-05,
1217
- "loss": 0.0853,
1218
- "step": 1760
1219
- },
1220
- {
1221
- "epoch": 2.64,
1222
- "learning_rate": 5.5995024875621896e-05,
1223
- "loss": 0.027,
1224
- "step": 1770
1225
- },
1226
- {
1227
- "epoch": 2.66,
1228
- "learning_rate": 5.574626865671642e-05,
1229
- "loss": 0.0283,
1230
- "step": 1780
1231
- },
1232
- {
1233
- "epoch": 2.67,
1234
- "learning_rate": 5.549751243781095e-05,
1235
- "loss": 0.0135,
1236
- "step": 1790
1237
- },
1238
- {
1239
- "epoch": 2.69,
1240
- "learning_rate": 5.524875621890547e-05,
1241
- "loss": 0.0304,
1242
- "step": 1800
1243
- },
1244
- {
1245
- "epoch": 2.69,
1246
- "eval_accuracy": 0.9722921914357683,
1247
- "eval_loss": 0.12775705754756927,
1248
- "eval_runtime": 18.9368,
1249
- "eval_samples_per_second": 62.893,
1250
- "eval_steps_per_second": 7.868,
1251
- "step": 1800
1252
- },
1253
- {
1254
- "epoch": 2.7,
1255
- "learning_rate": 5.500000000000001e-05,
1256
- "loss": 0.0089,
1257
- "step": 1810
1258
- },
1259
- {
1260
- "epoch": 2.72,
1261
- "learning_rate": 5.475124378109453e-05,
1262
- "loss": 0.0697,
1263
- "step": 1820
1264
- },
1265
- {
1266
- "epoch": 2.73,
1267
- "learning_rate": 5.4502487562189055e-05,
1268
- "loss": 0.0094,
1269
- "step": 1830
1270
- },
1271
- {
1272
- "epoch": 2.75,
1273
- "learning_rate": 5.4253731343283585e-05,
1274
- "loss": 0.0103,
1275
- "step": 1840
1276
- },
1277
- {
1278
- "epoch": 2.76,
1279
- "learning_rate": 5.400497512437811e-05,
1280
- "loss": 0.0399,
1281
- "step": 1850
1282
- },
1283
- {
1284
- "epoch": 2.78,
1285
- "learning_rate": 5.3756218905472645e-05,
1286
- "loss": 0.0181,
1287
- "step": 1860
1288
- },
1289
- {
1290
- "epoch": 2.79,
1291
- "learning_rate": 5.350746268656717e-05,
1292
- "loss": 0.0129,
1293
- "step": 1870
1294
- },
1295
- {
1296
- "epoch": 2.81,
1297
- "learning_rate": 5.325870646766169e-05,
1298
- "loss": 0.0096,
1299
- "step": 1880
1300
- },
1301
- {
1302
- "epoch": 2.82,
1303
- "learning_rate": 5.300995024875622e-05,
1304
- "loss": 0.0215,
1305
- "step": 1890
1306
- },
1307
- {
1308
- "epoch": 2.84,
1309
- "learning_rate": 5.2761194029850745e-05,
1310
- "loss": 0.0068,
1311
- "step": 1900
1312
- },
1313
- {
1314
- "epoch": 2.84,
1315
- "eval_accuracy": 0.9680940386230059,
1316
- "eval_loss": 0.12825354933738708,
1317
- "eval_runtime": 20.818,
1318
- "eval_samples_per_second": 57.21,
1319
- "eval_steps_per_second": 7.157,
1320
- "step": 1900
1321
- },
1322
- {
1323
- "epoch": 2.85,
1324
- "learning_rate": 5.251243781094528e-05,
1325
- "loss": 0.0513,
1326
- "step": 1910
1327
- },
1328
- {
1329
- "epoch": 2.87,
1330
- "learning_rate": 5.2263681592039805e-05,
1331
- "loss": 0.0949,
1332
- "step": 1920
1333
- },
1334
- {
1335
- "epoch": 2.88,
1336
- "learning_rate": 5.201492537313433e-05,
1337
- "loss": 0.0435,
1338
- "step": 1930
1339
- },
1340
- {
1341
- "epoch": 2.9,
1342
- "learning_rate": 5.176616915422886e-05,
1343
- "loss": 0.0382,
1344
- "step": 1940
1345
- },
1346
- {
1347
- "epoch": 2.91,
1348
- "learning_rate": 5.151741293532338e-05,
1349
- "loss": 0.0579,
1350
- "step": 1950
1351
- },
1352
- {
1353
- "epoch": 2.93,
1354
- "learning_rate": 5.126865671641792e-05,
1355
- "loss": 0.0151,
1356
- "step": 1960
1357
- },
1358
- {
1359
- "epoch": 2.94,
1360
- "learning_rate": 5.101990049751244e-05,
1361
- "loss": 0.051,
1362
- "step": 1970
1363
- },
1364
- {
1365
- "epoch": 2.96,
1366
- "learning_rate": 5.0771144278606964e-05,
1367
- "loss": 0.0474,
1368
- "step": 1980
1369
- },
1370
- {
1371
- "epoch": 2.97,
1372
- "learning_rate": 5.05223880597015e-05,
1373
- "loss": 0.0227,
1374
- "step": 1990
1375
- },
1376
- {
1377
- "epoch": 2.99,
1378
- "learning_rate": 5.027363184079602e-05,
1379
- "loss": 0.0319,
1380
- "step": 2000
1381
- },
1382
- {
1383
- "epoch": 2.99,
1384
- "eval_accuracy": 0.9865659109991604,
1385
- "eval_loss": 0.0589967742562294,
1386
- "eval_runtime": 19.1134,
1387
- "eval_samples_per_second": 62.312,
1388
- "eval_steps_per_second": 7.796,
1389
- "step": 2000
1390
- },
1391
- {
1392
- "epoch": 3.0,
1393
- "learning_rate": 5.0024875621890554e-05,
1394
- "loss": 0.013,
1395
- "step": 2010
1396
- },
1397
- {
1398
- "epoch": 3.01,
1399
- "learning_rate": 4.977611940298508e-05,
1400
- "loss": 0.0053,
1401
- "step": 2020
1402
- },
1403
- {
1404
- "epoch": 3.03,
1405
- "learning_rate": 4.952736318407961e-05,
1406
- "loss": 0.036,
1407
- "step": 2030
1408
- },
1409
- {
1410
- "epoch": 3.04,
1411
- "learning_rate": 4.927860696517414e-05,
1412
- "loss": 0.005,
1413
- "step": 2040
1414
- },
1415
- {
1416
- "epoch": 3.06,
1417
- "learning_rate": 4.902985074626866e-05,
1418
- "loss": 0.0061,
1419
- "step": 2050
1420
- },
1421
- {
1422
- "epoch": 3.07,
1423
- "learning_rate": 4.8781094527363184e-05,
1424
- "loss": 0.052,
1425
- "step": 2060
1426
- },
1427
- {
1428
- "epoch": 3.09,
1429
- "learning_rate": 4.8532338308457714e-05,
1430
- "loss": 0.0093,
1431
- "step": 2070
1432
- },
1433
- {
1434
- "epoch": 3.1,
1435
- "learning_rate": 4.8283582089552244e-05,
1436
- "loss": 0.0055,
1437
- "step": 2080
1438
- },
1439
- {
1440
- "epoch": 3.12,
1441
- "learning_rate": 4.803482587064677e-05,
1442
- "loss": 0.0194,
1443
- "step": 2090
1444
- },
1445
- {
1446
- "epoch": 3.13,
1447
- "learning_rate": 4.77860696517413e-05,
1448
- "loss": 0.0054,
1449
- "step": 2100
1450
- },
1451
- {
1452
- "epoch": 3.13,
1453
- "eval_accuracy": 0.982367758186398,
1454
- "eval_loss": 0.07611096650362015,
1455
- "eval_runtime": 19.2367,
1456
- "eval_samples_per_second": 61.913,
1457
- "eval_steps_per_second": 7.746,
1458
- "step": 2100
1459
- },
1460
- {
1461
- "epoch": 3.15,
1462
- "learning_rate": 4.753731343283582e-05,
1463
- "loss": 0.0433,
1464
- "step": 2110
1465
- },
1466
- {
1467
- "epoch": 3.16,
1468
- "learning_rate": 4.728855721393035e-05,
1469
- "loss": 0.0047,
1470
- "step": 2120
1471
- },
1472
- {
1473
- "epoch": 3.18,
1474
- "learning_rate": 4.703980099502488e-05,
1475
- "loss": 0.005,
1476
- "step": 2130
1477
- },
1478
- {
1479
- "epoch": 3.19,
1480
- "learning_rate": 4.67910447761194e-05,
1481
- "loss": 0.0087,
1482
- "step": 2140
1483
- },
1484
- {
1485
- "epoch": 3.21,
1486
- "learning_rate": 4.654228855721393e-05,
1487
- "loss": 0.0045,
1488
- "step": 2150
1489
- },
1490
- {
1491
- "epoch": 3.22,
1492
- "learning_rate": 4.6293532338308456e-05,
1493
- "loss": 0.0229,
1494
- "step": 2160
1495
- },
1496
- {
1497
- "epoch": 3.24,
1498
- "learning_rate": 4.6044776119402986e-05,
1499
- "loss": 0.0043,
1500
- "step": 2170
1501
- },
1502
- {
1503
- "epoch": 3.25,
1504
- "learning_rate": 4.5796019900497516e-05,
1505
- "loss": 0.006,
1506
- "step": 2180
1507
- },
1508
- {
1509
- "epoch": 3.27,
1510
- "learning_rate": 4.554726368159204e-05,
1511
- "loss": 0.0361,
1512
- "step": 2190
1513
- },
1514
- {
1515
- "epoch": 3.28,
1516
- "learning_rate": 4.529850746268657e-05,
1517
- "loss": 0.0136,
1518
- "step": 2200
1519
- },
1520
- {
1521
- "epoch": 3.28,
1522
- "eval_accuracy": 0.982367758186398,
1523
- "eval_loss": 0.07589124888181686,
1524
- "eval_runtime": 18.4299,
1525
- "eval_samples_per_second": 64.623,
1526
- "eval_steps_per_second": 8.085,
1527
- "step": 2200
1528
- },
1529
- {
1530
- "epoch": 3.3,
1531
- "learning_rate": 4.50497512437811e-05,
1532
- "loss": 0.0046,
1533
- "step": 2210
1534
- },
1535
- {
1536
- "epoch": 3.31,
1537
- "learning_rate": 4.480099502487562e-05,
1538
- "loss": 0.0136,
1539
- "step": 2220
1540
- },
1541
- {
1542
- "epoch": 3.33,
1543
- "learning_rate": 4.455223880597015e-05,
1544
- "loss": 0.0042,
1545
- "step": 2230
1546
- },
1547
- {
1548
- "epoch": 3.34,
1549
- "learning_rate": 4.4303482587064676e-05,
1550
- "loss": 0.0344,
1551
- "step": 2240
1552
- },
1553
- {
1554
- "epoch": 3.36,
1555
- "learning_rate": 4.4054726368159206e-05,
1556
- "loss": 0.0112,
1557
- "step": 2250
1558
- },
1559
- {
1560
- "epoch": 3.37,
1561
- "learning_rate": 4.3805970149253736e-05,
1562
- "loss": 0.0401,
1563
- "step": 2260
1564
- },
1565
- {
1566
- "epoch": 3.39,
1567
- "learning_rate": 4.355721393034826e-05,
1568
- "loss": 0.0255,
1569
- "step": 2270
1570
- },
1571
- {
1572
- "epoch": 3.4,
1573
- "learning_rate": 4.330845771144279e-05,
1574
- "loss": 0.0053,
1575
- "step": 2280
1576
- },
1577
- {
1578
- "epoch": 3.42,
1579
- "learning_rate": 4.305970149253731e-05,
1580
- "loss": 0.0223,
1581
- "step": 2290
1582
- },
1583
- {
1584
- "epoch": 3.43,
1585
- "learning_rate": 4.281094527363184e-05,
1586
- "loss": 0.0136,
1587
- "step": 2300
1588
- },
1589
- {
1590
- "epoch": 3.43,
1591
- "eval_accuracy": 0.9865659109991604,
1592
- "eval_loss": 0.04480349272489548,
1593
- "eval_runtime": 19.7958,
1594
- "eval_samples_per_second": 60.164,
1595
- "eval_steps_per_second": 7.527,
1596
- "step": 2300
1597
- },
1598
- {
1599
- "epoch": 3.45,
1600
- "learning_rate": 4.256218905472637e-05,
1601
- "loss": 0.0552,
1602
- "step": 2310
1603
- },
1604
- {
1605
- "epoch": 3.46,
1606
- "learning_rate": 4.2313432835820895e-05,
1607
- "loss": 0.0041,
1608
- "step": 2320
1609
- },
1610
- {
1611
- "epoch": 3.48,
1612
- "learning_rate": 4.2064676616915425e-05,
1613
- "loss": 0.004,
1614
- "step": 2330
1615
- },
1616
- {
1617
- "epoch": 3.49,
1618
- "learning_rate": 4.181592039800995e-05,
1619
- "loss": 0.0041,
1620
- "step": 2340
1621
- },
1622
- {
1623
- "epoch": 3.51,
1624
- "learning_rate": 4.156716417910448e-05,
1625
- "loss": 0.012,
1626
- "step": 2350
1627
- },
1628
- {
1629
- "epoch": 3.52,
1630
- "learning_rate": 4.131840796019901e-05,
1631
- "loss": 0.0039,
1632
- "step": 2360
1633
- },
1634
- {
1635
- "epoch": 3.54,
1636
- "learning_rate": 4.106965174129354e-05,
1637
- "loss": 0.0052,
1638
- "step": 2370
1639
- },
1640
- {
1641
- "epoch": 3.55,
1642
- "learning_rate": 4.082089552238806e-05,
1643
- "loss": 0.0049,
1644
- "step": 2380
1645
- },
1646
- {
1647
- "epoch": 3.57,
1648
- "learning_rate": 4.0572139303482585e-05,
1649
- "loss": 0.0297,
1650
- "step": 2390
1651
- },
1652
- {
1653
- "epoch": 3.58,
1654
- "learning_rate": 4.0323383084577115e-05,
1655
- "loss": 0.0086,
1656
- "step": 2400
1657
- },
1658
- {
1659
- "epoch": 3.58,
1660
- "eval_accuracy": 0.9848866498740554,
1661
- "eval_loss": 0.08157633990049362,
1662
- "eval_runtime": 19.2031,
1663
- "eval_samples_per_second": 62.021,
1664
- "eval_steps_per_second": 7.759,
1665
- "step": 2400
1666
- },
1667
- {
1668
- "epoch": 3.6,
1669
- "learning_rate": 4.0074626865671645e-05,
1670
- "loss": 0.0201,
1671
- "step": 2410
1672
- },
1673
- {
1674
- "epoch": 3.61,
1675
- "learning_rate": 3.9825870646766175e-05,
1676
- "loss": 0.0043,
1677
- "step": 2420
1678
- },
1679
- {
1680
- "epoch": 3.63,
1681
- "learning_rate": 3.95771144278607e-05,
1682
- "loss": 0.0037,
1683
- "step": 2430
1684
- },
1685
- {
1686
- "epoch": 3.64,
1687
- "learning_rate": 3.932835820895522e-05,
1688
- "loss": 0.0038,
1689
- "step": 2440
1690
- },
1691
- {
1692
- "epoch": 3.66,
1693
- "learning_rate": 3.907960199004975e-05,
1694
- "loss": 0.0698,
1695
- "step": 2450
1696
- },
1697
- {
1698
- "epoch": 3.67,
1699
- "learning_rate": 3.883084577114428e-05,
1700
- "loss": 0.0037,
1701
- "step": 2460
1702
- },
1703
- {
1704
- "epoch": 3.69,
1705
- "learning_rate": 3.858208955223881e-05,
1706
- "loss": 0.0039,
1707
- "step": 2470
1708
- },
1709
- {
1710
- "epoch": 3.7,
1711
- "learning_rate": 3.8333333333333334e-05,
1712
- "loss": 0.0267,
1713
- "step": 2480
1714
- },
1715
- {
1716
- "epoch": 3.72,
1717
- "learning_rate": 3.808457711442786e-05,
1718
- "loss": 0.0035,
1719
- "step": 2490
1720
- },
1721
- {
1722
- "epoch": 3.73,
1723
- "learning_rate": 3.783582089552239e-05,
1724
- "loss": 0.0212,
1725
- "step": 2500
1726
- },
1727
- {
1728
- "epoch": 3.73,
1729
- "eval_accuracy": 0.9848866498740554,
1730
- "eval_loss": 0.057428497821092606,
1731
- "eval_runtime": 19.0519,
1732
- "eval_samples_per_second": 62.513,
1733
- "eval_steps_per_second": 7.821,
1734
- "step": 2500
1735
- },
1736
- {
1737
- "epoch": 3.75,
1738
- "learning_rate": 3.758706467661692e-05,
1739
- "loss": 0.0045,
1740
- "step": 2510
1741
- },
1742
- {
1743
- "epoch": 3.76,
1744
- "learning_rate": 3.733830845771145e-05,
1745
- "loss": 0.0036,
1746
- "step": 2520
1747
- },
1748
- {
1749
- "epoch": 3.78,
1750
- "learning_rate": 3.708955223880598e-05,
1751
- "loss": 0.0033,
1752
- "step": 2530
1753
- },
1754
- {
1755
- "epoch": 3.79,
1756
- "learning_rate": 3.68407960199005e-05,
1757
- "loss": 0.0033,
1758
- "step": 2540
1759
- },
1760
- {
1761
- "epoch": 3.81,
1762
- "learning_rate": 3.6592039800995024e-05,
1763
- "loss": 0.0033,
1764
- "step": 2550
1765
- },
1766
- {
1767
- "epoch": 3.82,
1768
- "learning_rate": 3.6343283582089554e-05,
1769
- "loss": 0.0033,
1770
- "step": 2560
1771
- },
1772
- {
1773
- "epoch": 3.84,
1774
- "learning_rate": 3.6094527363184084e-05,
1775
- "loss": 0.0034,
1776
- "step": 2570
1777
- },
1778
- {
1779
- "epoch": 3.85,
1780
- "learning_rate": 3.584577114427861e-05,
1781
- "loss": 0.0031,
1782
- "step": 2580
1783
- },
1784
- {
1785
- "epoch": 3.87,
1786
- "learning_rate": 3.559701492537314e-05,
1787
- "loss": 0.0034,
1788
- "step": 2590
1789
- },
1790
- {
1791
- "epoch": 3.88,
1792
- "learning_rate": 3.534825870646766e-05,
1793
- "loss": 0.0392,
1794
- "step": 2600
1795
- },
1796
- {
1797
- "epoch": 3.88,
1798
- "eval_accuracy": 0.982367758186398,
1799
- "eval_loss": 0.07654974609613419,
1800
- "eval_runtime": 20.1007,
1801
- "eval_samples_per_second": 59.252,
1802
- "eval_steps_per_second": 7.413,
1803
- "step": 2600
1804
- },
1805
- {
1806
- "epoch": 3.9,
1807
- "learning_rate": 3.509950248756219e-05,
1808
- "loss": 0.0307,
1809
- "step": 2610
1810
- },
1811
- {
1812
- "epoch": 3.91,
1813
- "learning_rate": 3.485074626865672e-05,
1814
- "loss": 0.0075,
1815
- "step": 2620
1816
- },
1817
- {
1818
- "epoch": 3.93,
1819
- "learning_rate": 3.4601990049751244e-05,
1820
- "loss": 0.0325,
1821
- "step": 2630
1822
- },
1823
- {
1824
- "epoch": 3.94,
1825
- "learning_rate": 3.4353233830845774e-05,
1826
- "loss": 0.0032,
1827
- "step": 2640
1828
- },
1829
- {
1830
- "epoch": 3.96,
1831
- "learning_rate": 3.41044776119403e-05,
1832
- "loss": 0.0031,
1833
- "step": 2650
1834
- },
1835
- {
1836
- "epoch": 3.97,
1837
- "learning_rate": 3.385572139303483e-05,
1838
- "loss": 0.0151,
1839
- "step": 2660
1840
- },
1841
- {
1842
- "epoch": 3.99,
1843
- "learning_rate": 3.360696517412936e-05,
1844
- "loss": 0.0048,
1845
- "step": 2670
1846
- },
1847
- {
1848
- "epoch": 4.0,
1849
- "learning_rate": 3.335820895522388e-05,
1850
- "loss": 0.003,
1851
- "step": 2680
1852
- },
1853
- {
1854
- "epoch": 4.01,
1855
- "learning_rate": 3.310945273631841e-05,
1856
- "loss": 0.003,
1857
- "step": 2690
1858
- },
1859
- {
1860
- "epoch": 4.03,
1861
- "learning_rate": 3.286069651741294e-05,
1862
- "loss": 0.0029,
1863
- "step": 2700
1864
- },
1865
- {
1866
- "epoch": 4.03,
1867
- "eval_accuracy": 0.979009235936188,
1868
- "eval_loss": 0.073957659304142,
1869
- "eval_runtime": 19.9521,
1870
- "eval_samples_per_second": 59.693,
1871
- "eval_steps_per_second": 7.468,
1872
- "step": 2700
1873
- },
1874
- {
1875
- "epoch": 4.04,
1876
- "learning_rate": 3.261194029850746e-05,
1877
- "loss": 0.0036,
1878
- "step": 2710
1879
- },
1880
- {
1881
- "epoch": 4.06,
1882
- "learning_rate": 3.236318407960199e-05,
1883
- "loss": 0.0039,
1884
- "step": 2720
1885
- },
1886
- {
1887
- "epoch": 4.07,
1888
- "learning_rate": 3.2114427860696516e-05,
1889
- "loss": 0.0026,
1890
- "step": 2730
1891
- },
1892
- {
1893
- "epoch": 4.09,
1894
- "learning_rate": 3.1865671641791046e-05,
1895
- "loss": 0.0031,
1896
- "step": 2740
1897
- },
1898
- {
1899
- "epoch": 4.1,
1900
- "learning_rate": 3.1616915422885576e-05,
1901
- "loss": 0.0031,
1902
- "step": 2750
1903
- },
1904
- {
1905
- "epoch": 4.12,
1906
- "learning_rate": 3.13681592039801e-05,
1907
- "loss": 0.0027,
1908
- "step": 2760
1909
- },
1910
- {
1911
- "epoch": 4.13,
1912
- "learning_rate": 3.111940298507463e-05,
1913
- "loss": 0.0029,
1914
- "step": 2770
1915
- },
1916
- {
1917
- "epoch": 4.15,
1918
- "learning_rate": 3.087064676616915e-05,
1919
- "loss": 0.0029,
1920
- "step": 2780
1921
- },
1922
- {
1923
- "epoch": 4.16,
1924
- "learning_rate": 3.062189054726368e-05,
1925
- "loss": 0.0028,
1926
- "step": 2790
1927
- },
1928
- {
1929
- "epoch": 4.18,
1930
- "learning_rate": 3.037313432835821e-05,
1931
- "loss": 0.0521,
1932
- "step": 2800
1933
- },
1934
- {
1935
- "epoch": 4.18,
1936
- "eval_accuracy": 0.9924433249370277,
1937
- "eval_loss": 0.03492419794201851,
1938
- "eval_runtime": 20.5925,
1939
- "eval_samples_per_second": 57.837,
1940
- "eval_steps_per_second": 7.236,
1941
- "step": 2800
1942
- },
1943
- {
1944
- "epoch": 4.19,
1945
- "learning_rate": 3.012437810945274e-05,
1946
- "loss": 0.0029,
1947
- "step": 2810
1948
- },
1949
- {
1950
- "epoch": 4.21,
1951
- "learning_rate": 2.9875621890547266e-05,
1952
- "loss": 0.0029,
1953
- "step": 2820
1954
- },
1955
- {
1956
- "epoch": 4.22,
1957
- "learning_rate": 2.962686567164179e-05,
1958
- "loss": 0.0027,
1959
- "step": 2830
1960
- },
1961
- {
1962
- "epoch": 4.24,
1963
- "learning_rate": 2.937810945273632e-05,
1964
- "loss": 0.0029,
1965
- "step": 2840
1966
- },
1967
- {
1968
- "epoch": 4.25,
1969
- "learning_rate": 2.912935323383085e-05,
1970
- "loss": 0.0026,
1971
- "step": 2850
1972
- },
1973
- {
1974
- "epoch": 4.27,
1975
- "learning_rate": 2.8880597014925376e-05,
1976
- "loss": 0.004,
1977
- "step": 2860
1978
- },
1979
- {
1980
- "epoch": 4.28,
1981
- "learning_rate": 2.8631840796019905e-05,
1982
- "loss": 0.0027,
1983
- "step": 2870
1984
- },
1985
- {
1986
- "epoch": 4.3,
1987
- "learning_rate": 2.838308457711443e-05,
1988
- "loss": 0.0028,
1989
- "step": 2880
1990
- },
1991
- {
1992
- "epoch": 4.31,
1993
- "learning_rate": 2.8134328358208955e-05,
1994
- "loss": 0.0029,
1995
- "step": 2890
1996
- },
1997
- {
1998
- "epoch": 4.33,
1999
- "learning_rate": 2.7885572139303485e-05,
2000
- "loss": 0.0025,
2001
- "step": 2900
2002
- },
2003
- {
2004
- "epoch": 4.33,
2005
- "eval_accuracy": 0.9865659109991604,
2006
- "eval_loss": 0.06394174695014954,
2007
- "eval_runtime": 19.857,
2008
- "eval_samples_per_second": 59.979,
2009
- "eval_steps_per_second": 7.504,
2010
- "step": 2900
2011
- },
2012
- {
2013
- "epoch": 4.34,
2014
- "learning_rate": 2.7636815920398012e-05,
2015
- "loss": 0.0028,
2016
- "step": 2910
2017
- },
2018
- {
2019
- "epoch": 4.36,
2020
- "learning_rate": 2.7388059701492542e-05,
2021
- "loss": 0.0027,
2022
- "step": 2920
2023
- },
2024
- {
2025
- "epoch": 4.37,
2026
- "learning_rate": 2.7139303482587065e-05,
2027
- "loss": 0.0026,
2028
- "step": 2930
2029
- },
2030
- {
2031
- "epoch": 4.39,
2032
- "learning_rate": 2.689054726368159e-05,
2033
- "loss": 0.0026,
2034
- "step": 2940
2035
- },
2036
- {
2037
- "epoch": 4.4,
2038
- "learning_rate": 2.664179104477612e-05,
2039
- "loss": 0.0025,
2040
- "step": 2950
2041
- },
2042
- {
2043
- "epoch": 4.42,
2044
- "learning_rate": 2.6393034825870648e-05,
2045
- "loss": 0.0025,
2046
- "step": 2960
2047
- },
2048
- {
2049
- "epoch": 4.43,
2050
- "learning_rate": 2.6144278606965178e-05,
2051
- "loss": 0.0027,
2052
- "step": 2970
2053
- },
2054
- {
2055
- "epoch": 4.45,
2056
- "learning_rate": 2.58955223880597e-05,
2057
- "loss": 0.0026,
2058
- "step": 2980
2059
- },
2060
- {
2061
- "epoch": 4.46,
2062
- "learning_rate": 2.5646766169154228e-05,
2063
- "loss": 0.0025,
2064
- "step": 2990
2065
- },
2066
- {
2067
- "epoch": 4.48,
2068
- "learning_rate": 2.5398009950248758e-05,
2069
- "loss": 0.0024,
2070
- "step": 3000
2071
- },
2072
- {
2073
- "epoch": 4.48,
2074
- "eval_accuracy": 0.9882451721242653,
2075
- "eval_loss": 0.050867918878793716,
2076
- "eval_runtime": 19.0308,
2077
- "eval_samples_per_second": 62.583,
2078
- "eval_steps_per_second": 7.829,
2079
- "step": 3000
2080
- },
2081
- {
2082
- "epoch": 4.49,
2083
- "learning_rate": 2.5149253731343288e-05,
2084
- "loss": 0.0033,
2085
- "step": 3010
2086
- },
2087
- {
2088
- "epoch": 4.51,
2089
- "learning_rate": 2.490049751243781e-05,
2090
- "loss": 0.0026,
2091
- "step": 3020
2092
- },
2093
- {
2094
- "epoch": 4.52,
2095
- "learning_rate": 2.465174129353234e-05,
2096
- "loss": 0.0026,
2097
- "step": 3030
2098
- },
2099
- {
2100
- "epoch": 4.54,
2101
- "learning_rate": 2.4402985074626868e-05,
2102
- "loss": 0.0025,
2103
- "step": 3040
2104
- },
2105
- {
2106
- "epoch": 4.55,
2107
- "learning_rate": 2.4154228855721394e-05,
2108
- "loss": 0.0024,
2109
- "step": 3050
2110
- },
2111
- {
2112
- "epoch": 4.57,
2113
- "learning_rate": 2.390547263681592e-05,
2114
- "loss": 0.0025,
2115
- "step": 3060
2116
- },
2117
- {
2118
- "epoch": 4.58,
2119
- "learning_rate": 2.3656716417910448e-05,
2120
- "loss": 0.0025,
2121
- "step": 3070
2122
- },
2123
- {
2124
- "epoch": 4.6,
2125
- "learning_rate": 2.3407960199004977e-05,
2126
- "loss": 0.0361,
2127
- "step": 3080
2128
- },
2129
- {
2130
- "epoch": 4.61,
2131
- "learning_rate": 2.3159203980099504e-05,
2132
- "loss": 0.0024,
2133
- "step": 3090
2134
- },
2135
- {
2136
- "epoch": 4.63,
2137
- "learning_rate": 2.291044776119403e-05,
2138
- "loss": 0.0024,
2139
- "step": 3100
2140
- },
2141
- {
2142
- "epoch": 4.63,
2143
- "eval_accuracy": 0.9899244332493703,
2144
- "eval_loss": 0.0467989481985569,
2145
- "eval_runtime": 19.003,
2146
- "eval_samples_per_second": 62.674,
2147
- "eval_steps_per_second": 7.841,
2148
- "step": 3100
2149
- },
2150
- {
2151
- "epoch": 4.64,
2152
- "learning_rate": 2.2661691542288557e-05,
2153
- "loss": 0.0037,
2154
- "step": 3110
2155
- },
2156
- {
2157
- "epoch": 4.66,
2158
- "learning_rate": 2.2412935323383087e-05,
2159
- "loss": 0.0125,
2160
- "step": 3120
2161
- },
2162
- {
2163
- "epoch": 4.67,
2164
- "learning_rate": 2.2164179104477614e-05,
2165
- "loss": 0.0025,
2166
- "step": 3130
2167
- },
2168
- {
2169
- "epoch": 4.69,
2170
- "learning_rate": 2.191542288557214e-05,
2171
- "loss": 0.0026,
2172
- "step": 3140
2173
- },
2174
- {
2175
- "epoch": 4.7,
2176
- "learning_rate": 2.1666666666666667e-05,
2177
- "loss": 0.0026,
2178
- "step": 3150
2179
- },
2180
- {
2181
- "epoch": 4.72,
2182
- "learning_rate": 2.1417910447761194e-05,
2183
- "loss": 0.0023,
2184
- "step": 3160
2185
- },
2186
- {
2187
- "epoch": 4.73,
2188
- "learning_rate": 2.1169154228855724e-05,
2189
- "loss": 0.0022,
2190
- "step": 3170
2191
- },
2192
- {
2193
- "epoch": 4.75,
2194
- "learning_rate": 2.092039800995025e-05,
2195
- "loss": 0.0024,
2196
- "step": 3180
2197
- },
2198
- {
2199
- "epoch": 4.76,
2200
- "learning_rate": 2.0671641791044777e-05,
2201
- "loss": 0.0023,
2202
- "step": 3190
2203
- },
2204
- {
2205
- "epoch": 4.78,
2206
- "learning_rate": 2.0422885572139307e-05,
2207
- "loss": 0.0024,
2208
- "step": 3200
2209
- },
2210
- {
2211
- "epoch": 4.78,
2212
- "eval_accuracy": 0.9882451721242653,
2213
- "eval_loss": 0.052428748458623886,
2214
- "eval_runtime": 20.9205,
2215
- "eval_samples_per_second": 56.93,
2216
- "eval_steps_per_second": 7.122,
2217
- "step": 3200
2218
- },
2219
- {
2220
- "epoch": 4.79,
2221
- "learning_rate": 2.017412935323383e-05,
2222
- "loss": 0.0132,
2223
- "step": 3210
2224
- },
2225
- {
2226
- "epoch": 4.81,
2227
- "learning_rate": 1.992537313432836e-05,
2228
- "loss": 0.0023,
2229
- "step": 3220
2230
- },
2231
- {
2232
- "epoch": 4.82,
2233
- "learning_rate": 1.9676616915422887e-05,
2234
- "loss": 0.0024,
2235
- "step": 3230
2236
- },
2237
- {
2238
- "epoch": 4.84,
2239
- "learning_rate": 1.9427860696517413e-05,
2240
- "loss": 0.0023,
2241
- "step": 3240
2242
- },
2243
- {
2244
- "epoch": 4.85,
2245
- "learning_rate": 1.9179104477611943e-05,
2246
- "loss": 0.0022,
2247
- "step": 3250
2248
- },
2249
- {
2250
- "epoch": 4.87,
2251
- "learning_rate": 1.8930348258706466e-05,
2252
- "loss": 0.0025,
2253
- "step": 3260
2254
- },
2255
- {
2256
- "epoch": 4.88,
2257
- "learning_rate": 1.8681592039800996e-05,
2258
- "loss": 0.0028,
2259
- "step": 3270
2260
- },
2261
- {
2262
- "epoch": 4.9,
2263
- "learning_rate": 1.8432835820895523e-05,
2264
- "loss": 0.0022,
2265
- "step": 3280
2266
- },
2267
- {
2268
- "epoch": 4.91,
2269
- "learning_rate": 1.818407960199005e-05,
2270
- "loss": 0.0021,
2271
- "step": 3290
2272
- },
2273
- {
2274
- "epoch": 4.93,
2275
- "learning_rate": 1.793532338308458e-05,
2276
- "loss": 0.0022,
2277
- "step": 3300
2278
- },
2279
- {
2280
- "epoch": 4.93,
2281
- "eval_accuracy": 0.9899244332493703,
2282
- "eval_loss": 0.0504080168902874,
2283
- "eval_runtime": 19.0283,
2284
- "eval_samples_per_second": 62.591,
2285
- "eval_steps_per_second": 7.83,
2286
- "step": 3300
2287
- },
2288
- {
2289
- "epoch": 4.94,
2290
- "learning_rate": 1.7686567164179106e-05,
2291
- "loss": 0.0021,
2292
- "step": 3310
2293
- },
2294
- {
2295
- "epoch": 4.96,
2296
- "learning_rate": 1.7437810945273633e-05,
2297
- "loss": 0.0023,
2298
- "step": 3320
2299
- },
2300
- {
2301
- "epoch": 4.97,
2302
- "learning_rate": 1.718905472636816e-05,
2303
- "loss": 0.0022,
2304
- "step": 3330
2305
- },
2306
- {
2307
- "epoch": 4.99,
2308
- "learning_rate": 1.6940298507462686e-05,
2309
- "loss": 0.0021,
2310
- "step": 3340
2311
- },
2312
- {
2313
- "epoch": 5.0,
2314
- "learning_rate": 1.6691542288557216e-05,
2315
- "loss": 0.002,
2316
- "step": 3350
2317
- },
2318
- {
2319
- "epoch": 5.01,
2320
- "learning_rate": 1.6442786069651742e-05,
2321
- "loss": 0.0021,
2322
- "step": 3360
2323
- },
2324
- {
2325
- "epoch": 5.03,
2326
- "learning_rate": 1.619402985074627e-05,
2327
- "loss": 0.0021,
2328
- "step": 3370
2329
- },
2330
- {
2331
- "epoch": 5.04,
2332
- "learning_rate": 1.5945273631840796e-05,
2333
- "loss": 0.0021,
2334
- "step": 3380
2335
- },
2336
- {
2337
- "epoch": 5.06,
2338
- "learning_rate": 1.5696517412935326e-05,
2339
- "loss": 0.0022,
2340
- "step": 3390
2341
- },
2342
- {
2343
- "epoch": 5.07,
2344
- "learning_rate": 1.5447761194029852e-05,
2345
- "loss": 0.0022,
2346
- "step": 3400
2347
- },
2348
- {
2349
- "epoch": 5.07,
2350
- "eval_accuracy": 0.9890848026868178,
2351
- "eval_loss": 0.048807818442583084,
2352
- "eval_runtime": 20.0305,
2353
- "eval_samples_per_second": 59.459,
2354
- "eval_steps_per_second": 7.439,
2355
- "step": 3400
2356
- },
2357
- {
2358
- "epoch": 5.09,
2359
- "learning_rate": 1.5199004975124379e-05,
2360
- "loss": 0.002,
2361
- "step": 3410
2362
- },
2363
- {
2364
- "epoch": 5.1,
2365
- "learning_rate": 1.4950248756218907e-05,
2366
- "loss": 0.0021,
2367
- "step": 3420
2368
- },
2369
- {
2370
- "epoch": 5.12,
2371
- "learning_rate": 1.4701492537313432e-05,
2372
- "loss": 0.0023,
2373
- "step": 3430
2374
- },
2375
- {
2376
- "epoch": 5.13,
2377
- "learning_rate": 1.4452736318407962e-05,
2378
- "loss": 0.0022,
2379
- "step": 3440
2380
- },
2381
- {
2382
- "epoch": 5.15,
2383
- "learning_rate": 1.4203980099502487e-05,
2384
- "loss": 0.002,
2385
- "step": 3450
2386
- },
2387
- {
2388
- "epoch": 5.16,
2389
- "learning_rate": 1.3955223880597015e-05,
2390
- "loss": 0.002,
2391
- "step": 3460
2392
- },
2393
- {
2394
- "epoch": 5.18,
2395
- "learning_rate": 1.3706467661691543e-05,
2396
- "loss": 0.0022,
2397
- "step": 3470
2398
- },
2399
- {
2400
- "epoch": 5.19,
2401
- "learning_rate": 1.345771144278607e-05,
2402
- "loss": 0.0021,
2403
- "step": 3480
2404
- },
2405
- {
2406
- "epoch": 5.21,
2407
- "learning_rate": 1.3208955223880598e-05,
2408
- "loss": 0.0021,
2409
- "step": 3490
2410
- },
2411
- {
2412
- "epoch": 5.22,
2413
- "learning_rate": 1.2960199004975123e-05,
2414
- "loss": 0.0021,
2415
- "step": 3500
2416
- },
2417
- {
2418
- "epoch": 5.22,
2419
- "eval_accuracy": 0.9899244332493703,
2420
- "eval_loss": 0.04841174930334091,
2421
- "eval_runtime": 19.9366,
2422
- "eval_samples_per_second": 59.739,
2423
- "eval_steps_per_second": 7.474,
2424
- "step": 3500
2425
- },
2426
- {
2427
- "epoch": 5.24,
2428
- "learning_rate": 1.2711442786069651e-05,
2429
- "loss": 0.002,
2430
- "step": 3510
2431
- },
2432
- {
2433
- "epoch": 5.25,
2434
- "learning_rate": 1.246268656716418e-05,
2435
- "loss": 0.002,
2436
- "step": 3520
2437
- },
2438
- {
2439
- "epoch": 5.27,
2440
- "learning_rate": 1.2213930348258706e-05,
2441
- "loss": 0.002,
2442
- "step": 3530
2443
- },
2444
- {
2445
- "epoch": 5.28,
2446
- "learning_rate": 1.1965174129353235e-05,
2447
- "loss": 0.002,
2448
- "step": 3540
2449
- },
2450
- {
2451
- "epoch": 5.3,
2452
- "learning_rate": 1.1716417910447761e-05,
2453
- "loss": 0.002,
2454
- "step": 3550
2455
- },
2456
- {
2457
- "epoch": 5.31,
2458
- "learning_rate": 1.146766169154229e-05,
2459
- "loss": 0.0021,
2460
- "step": 3560
2461
- },
2462
- {
2463
- "epoch": 5.33,
2464
- "learning_rate": 1.1218905472636816e-05,
2465
- "loss": 0.0021,
2466
- "step": 3570
2467
- },
2468
- {
2469
- "epoch": 5.34,
2470
- "learning_rate": 1.0970149253731343e-05,
2471
- "loss": 0.0021,
2472
- "step": 3580
2473
- },
2474
- {
2475
- "epoch": 5.36,
2476
- "learning_rate": 1.0721393034825871e-05,
2477
- "loss": 0.002,
2478
- "step": 3590
2479
- },
2480
- {
2481
- "epoch": 5.37,
2482
- "learning_rate": 1.04726368159204e-05,
2483
- "loss": 0.0019,
2484
- "step": 3600
2485
- },
2486
- {
2487
- "epoch": 5.37,
2488
- "eval_accuracy": 0.9899244332493703,
2489
- "eval_loss": 0.04795178398489952,
2490
- "eval_runtime": 19.4137,
2491
- "eval_samples_per_second": 61.348,
2492
- "eval_steps_per_second": 7.675,
2493
- "step": 3600
2494
- },
2495
- {
2496
- "epoch": 5.39,
2497
- "learning_rate": 1.0223880597014926e-05,
2498
- "loss": 0.002,
2499
- "step": 3610
2500
- },
2501
- {
2502
- "epoch": 5.4,
2503
- "learning_rate": 9.975124378109452e-06,
2504
- "loss": 0.002,
2505
- "step": 3620
2506
- },
2507
- {
2508
- "epoch": 5.42,
2509
- "learning_rate": 9.72636815920398e-06,
2510
- "loss": 0.002,
2511
- "step": 3630
2512
- },
2513
- {
2514
- "epoch": 5.43,
2515
- "learning_rate": 9.477611940298507e-06,
2516
- "loss": 0.002,
2517
- "step": 3640
2518
- },
2519
- {
2520
- "epoch": 5.45,
2521
- "learning_rate": 9.228855721393036e-06,
2522
- "loss": 0.002,
2523
- "step": 3650
2524
- },
2525
- {
2526
- "epoch": 5.46,
2527
- "learning_rate": 8.980099502487562e-06,
2528
- "loss": 0.0019,
2529
- "step": 3660
2530
- },
2531
- {
2532
- "epoch": 5.48,
2533
- "learning_rate": 8.73134328358209e-06,
2534
- "loss": 0.0019,
2535
- "step": 3670
2536
- },
2537
- {
2538
- "epoch": 5.49,
2539
- "learning_rate": 8.482587064676617e-06,
2540
- "loss": 0.0019,
2541
- "step": 3680
2542
- },
2543
- {
2544
- "epoch": 5.51,
2545
- "learning_rate": 8.233830845771144e-06,
2546
- "loss": 0.002,
2547
- "step": 3690
2548
- },
2549
- {
2550
- "epoch": 5.52,
2551
- "learning_rate": 7.985074626865672e-06,
2552
- "loss": 0.0019,
2553
- "step": 3700
2554
- },
2555
- {
2556
- "epoch": 5.52,
2557
- "eval_accuracy": 0.9899244332493703,
2558
- "eval_loss": 0.04765713959932327,
2559
- "eval_runtime": 19.0716,
2560
- "eval_samples_per_second": 62.449,
2561
- "eval_steps_per_second": 7.813,
2562
- "step": 3700
2563
- },
2564
- {
2565
- "epoch": 5.54,
2566
- "learning_rate": 7.7363184079602e-06,
2567
- "loss": 0.002,
2568
- "step": 3710
2569
- },
2570
- {
2571
- "epoch": 5.55,
2572
- "learning_rate": 7.487562189054727e-06,
2573
- "loss": 0.0019,
2574
- "step": 3720
2575
- },
2576
- {
2577
- "epoch": 5.57,
2578
- "learning_rate": 7.238805970149254e-06,
2579
- "loss": 0.002,
2580
- "step": 3730
2581
- },
2582
- {
2583
- "epoch": 5.58,
2584
- "learning_rate": 6.990049751243781e-06,
2585
- "loss": 0.0019,
2586
- "step": 3740
2587
- },
2588
- {
2589
- "epoch": 5.6,
2590
- "learning_rate": 6.741293532338309e-06,
2591
- "loss": 0.0019,
2592
- "step": 3750
2593
- },
2594
- {
2595
- "epoch": 5.61,
2596
- "learning_rate": 6.492537313432837e-06,
2597
- "loss": 0.002,
2598
- "step": 3760
2599
- },
2600
- {
2601
- "epoch": 5.63,
2602
- "learning_rate": 6.243781094527364e-06,
2603
- "loss": 0.002,
2604
- "step": 3770
2605
- },
2606
- {
2607
- "epoch": 5.64,
2608
- "learning_rate": 5.995024875621891e-06,
2609
- "loss": 0.002,
2610
- "step": 3780
2611
- },
2612
- {
2613
- "epoch": 5.66,
2614
- "learning_rate": 5.746268656716418e-06,
2615
- "loss": 0.002,
2616
- "step": 3790
2617
- },
2618
- {
2619
- "epoch": 5.67,
2620
- "learning_rate": 5.4975124378109455e-06,
2621
- "loss": 0.0019,
2622
- "step": 3800
2623
- },
2624
- {
2625
- "epoch": 5.67,
2626
- "eval_accuracy": 0.9899244332493703,
2627
- "eval_loss": 0.04748144745826721,
2628
- "eval_runtime": 19.2405,
2629
- "eval_samples_per_second": 61.901,
2630
- "eval_steps_per_second": 7.744,
2631
- "step": 3800
2632
- },
2633
- {
2634
- "epoch": 5.69,
2635
- "learning_rate": 5.248756218905473e-06,
2636
- "loss": 0.002,
2637
- "step": 3810
2638
- },
2639
- {
2640
- "epoch": 5.7,
2641
- "learning_rate": 5e-06,
2642
- "loss": 0.0019,
2643
- "step": 3820
2644
- },
2645
- {
2646
- "epoch": 5.72,
2647
- "learning_rate": 4.751243781094528e-06,
2648
- "loss": 0.0019,
2649
- "step": 3830
2650
- },
2651
- {
2652
- "epoch": 5.73,
2653
- "learning_rate": 4.5024875621890544e-06,
2654
- "loss": 0.0019,
2655
- "step": 3840
2656
- },
2657
- {
2658
- "epoch": 5.75,
2659
- "learning_rate": 4.253731343283583e-06,
2660
- "loss": 0.0018,
2661
- "step": 3850
2662
- },
2663
- {
2664
- "epoch": 5.76,
2665
- "learning_rate": 4.004975124378109e-06,
2666
- "loss": 0.002,
2667
- "step": 3860
2668
- },
2669
- {
2670
- "epoch": 5.78,
2671
- "learning_rate": 3.7562189054726368e-06,
2672
- "loss": 0.0018,
2673
- "step": 3870
2674
- },
2675
- {
2676
- "epoch": 5.79,
2677
- "learning_rate": 3.5074626865671646e-06,
2678
- "loss": 0.0019,
2679
- "step": 3880
2680
- },
2681
- {
2682
- "epoch": 5.81,
2683
- "learning_rate": 3.2587064676616916e-06,
2684
- "loss": 0.0019,
2685
- "step": 3890
2686
- },
2687
- {
2688
- "epoch": 5.82,
2689
- "learning_rate": 3.009950248756219e-06,
2690
- "loss": 0.0018,
2691
- "step": 3900
2692
- },
2693
- {
2694
- "epoch": 5.82,
2695
- "eval_accuracy": 0.9899244332493703,
2696
- "eval_loss": 0.0473811998963356,
2697
- "eval_runtime": 22.4733,
2698
- "eval_samples_per_second": 52.996,
2699
- "eval_steps_per_second": 6.63,
2700
- "step": 3900
2701
- },
2702
- {
2703
- "epoch": 5.84,
2704
- "learning_rate": 2.7611940298507465e-06,
2705
- "loss": 0.002,
2706
- "step": 3910
2707
- },
2708
- {
2709
- "epoch": 5.85,
2710
- "learning_rate": 2.512437810945274e-06,
2711
- "loss": 0.0018,
2712
- "step": 3920
2713
- },
2714
- {
2715
- "epoch": 5.87,
2716
- "learning_rate": 2.263681592039801e-06,
2717
- "loss": 0.0019,
2718
- "step": 3930
2719
- },
2720
- {
2721
- "epoch": 5.88,
2722
- "learning_rate": 2.0149253731343284e-06,
2723
- "loss": 0.0018,
2724
- "step": 3940
2725
- },
2726
- {
2727
- "epoch": 5.9,
2728
- "learning_rate": 1.7661691542288559e-06,
2729
- "loss": 0.002,
2730
- "step": 3950
2731
- },
2732
- {
2733
- "epoch": 5.91,
2734
- "learning_rate": 1.517412935323383e-06,
2735
- "loss": 0.0019,
2736
- "step": 3960
2737
- },
2738
- {
2739
- "epoch": 5.93,
2740
- "learning_rate": 1.2686567164179105e-06,
2741
- "loss": 0.0019,
2742
- "step": 3970
2743
- },
2744
- {
2745
- "epoch": 5.94,
2746
- "learning_rate": 1.019900497512438e-06,
2747
- "loss": 0.002,
2748
- "step": 3980
2749
- },
2750
- {
2751
- "epoch": 5.96,
2752
- "learning_rate": 7.711442786069652e-07,
2753
- "loss": 0.0019,
2754
- "step": 3990
2755
- },
2756
- {
2757
- "epoch": 5.97,
2758
- "learning_rate": 5.223880597014926e-07,
2759
- "loss": 0.0018,
2760
- "step": 4000
2761
- },
2762
- {
2763
- "epoch": 5.97,
2764
- "eval_accuracy": 0.9899244332493703,
2765
- "eval_loss": 0.047312360256910324,
2766
- "eval_runtime": 19.2838,
2767
- "eval_samples_per_second": 61.762,
2768
- "eval_steps_per_second": 7.727,
2769
- "step": 4000
2770
- },
2771
- {
2772
- "epoch": 5.99,
2773
- "learning_rate": 2.736318407960199e-07,
2774
- "loss": 0.0019,
2775
- "step": 4010
2776
- },
2777
- {
2778
- "epoch": 6.0,
2779
- "learning_rate": 2.4875621890547262e-08,
2780
- "loss": 0.002,
2781
- "step": 4020
2782
- },
2783
- {
2784
- "epoch": 6.0,
2785
- "step": 4020,
2786
- "total_flos": 4.98440050928135e+18,
2787
- "train_loss": 0.10442606876627426,
2788
- "train_runtime": 2908.5673,
2789
- "train_samples_per_second": 22.112,
2790
- "train_steps_per_second": 1.382
2791
  }
2792
  ],
2793
  "logging_steps": 10,
2794
- "max_steps": 4020,
2795
- "num_train_epochs": 6,
2796
  "save_steps": 100,
2797
- "total_flos": 4.98440050928135e+18,
2798
  "trial_name": null,
2799
  "trial_params": null
2800
  }
 
1
  {
2
+ "best_metric": 0.03965451195836067,
3
+ "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-500",
4
+ "epoch": 1.0,
5
  "eval_steps": 100,
6
+ "global_step": 596,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02,
13
+ "learning_rate": 9.83221476510067e-07,
14
+ "loss": 0.0019,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.03,
19
+ "learning_rate": 9.664429530201342e-07,
20
+ "loss": 0.002,
21
  "step": 20
22
  },
23
  {
24
+ "epoch": 0.05,
25
+ "learning_rate": 9.496644295302014e-07,
26
+ "loss": 0.0037,
27
  "step": 30
28
  },
29
  {
30
+ "epoch": 0.07,
31
+ "learning_rate": 9.328859060402684e-07,
32
+ "loss": 0.0039,
33
  "step": 40
34
  },
35
  {
36
+ "epoch": 0.08,
37
+ "learning_rate": 9.161073825503355e-07,
38
+ "loss": 0.0204,
39
  "step": 50
40
  },
41
  {
42
+ "epoch": 0.1,
43
+ "learning_rate": 8.993288590604026e-07,
44
+ "loss": 0.0304,
45
  "step": 60
46
  },
47
  {
48
+ "epoch": 0.12,
49
+ "learning_rate": 8.825503355704698e-07,
50
+ "loss": 0.0142,
51
  "step": 70
52
  },
53
  {
54
+ "epoch": 0.13,
55
+ "learning_rate": 8.657718120805368e-07,
56
+ "loss": 0.038,
57
  "step": 80
58
  },
59
  {
60
+ "epoch": 0.15,
61
+ "learning_rate": 8.48993288590604e-07,
62
+ "loss": 0.0019,
63
  "step": 90
64
  },
65
  {
66
+ "epoch": 0.17,
67
+ "learning_rate": 8.322147651006712e-07,
68
+ "loss": 0.004,
69
  "step": 100
70
  },
71
  {
72
+ "epoch": 0.17,
73
+ "eval_accuracy": 0.9903442485306465,
74
+ "eval_loss": 0.04405502602458,
75
+ "eval_runtime": 57.4827,
76
+ "eval_samples_per_second": 41.439,
77
+ "eval_steps_per_second": 5.184,
78
  "step": 100
79
  },
80
  {
81
+ "epoch": 0.18,
82
+ "learning_rate": 8.154362416107382e-07,
83
+ "loss": 0.0017,
84
  "step": 110
85
  },
86
  {
87
+ "epoch": 0.2,
88
+ "learning_rate": 7.986577181208053e-07,
89
+ "loss": 0.0018,
90
  "step": 120
91
  },
92
  {
93
+ "epoch": 0.22,
94
+ "learning_rate": 7.818791946308725e-07,
95
+ "loss": 0.0017,
96
  "step": 130
97
  },
98
  {
99
+ "epoch": 0.23,
100
+ "learning_rate": 7.651006711409396e-07,
101
+ "loss": 0.0182,
102
  "step": 140
103
  },
104
  {
105
+ "epoch": 0.25,
106
+ "learning_rate": 7.483221476510066e-07,
107
+ "loss": 0.0017,
108
  "step": 150
109
  },
110
  {
111
+ "epoch": 0.27,
112
+ "learning_rate": 7.315436241610739e-07,
113
+ "loss": 0.011,
114
  "step": 160
115
  },
116
  {
117
+ "epoch": 0.29,
118
+ "learning_rate": 7.147651006711409e-07,
119
+ "loss": 0.0018,
120
  "step": 170
121
  },
122
  {
123
+ "epoch": 0.3,
124
+ "learning_rate": 6.97986577181208e-07,
125
+ "loss": 0.0065,
126
  "step": 180
127
  },
128
  {
129
+ "epoch": 0.32,
130
+ "learning_rate": 6.81208053691275e-07,
131
+ "loss": 0.0019,
132
  "step": 190
133
  },
134
  {
135
+ "epoch": 0.34,
136
+ "learning_rate": 6.644295302013423e-07,
137
+ "loss": 0.0025,
138
  "step": 200
139
  },
140
  {
141
+ "epoch": 0.34,
142
+ "eval_accuracy": 0.9903442485306465,
143
+ "eval_loss": 0.04235763102769852,
144
+ "eval_runtime": 58.8296,
145
+ "eval_samples_per_second": 40.49,
146
+ "eval_steps_per_second": 5.065,
147
  "step": 200
148
  },
149
  {
150
+ "epoch": 0.35,
151
+ "learning_rate": 6.476510067114094e-07,
152
+ "loss": 0.0017,
153
  "step": 210
154
  },
155
  {
156
+ "epoch": 0.37,
157
+ "learning_rate": 6.308724832214764e-07,
158
+ "loss": 0.0016,
159
  "step": 220
160
  },
161
  {
162
+ "epoch": 0.39,
163
+ "learning_rate": 6.140939597315436e-07,
164
+ "loss": 0.0069,
165
  "step": 230
166
  },
167
  {
168
+ "epoch": 0.4,
169
+ "learning_rate": 5.973154362416107e-07,
170
+ "loss": 0.005,
171
  "step": 240
172
  },
173
  {
174
+ "epoch": 0.42,
175
+ "learning_rate": 5.805369127516778e-07,
176
+ "loss": 0.0021,
177
  "step": 250
178
  },
179
  {
180
+ "epoch": 0.44,
181
+ "learning_rate": 5.637583892617449e-07,
182
+ "loss": 0.0015,
183
  "step": 260
184
  },
185
  {
186
+ "epoch": 0.45,
187
+ "learning_rate": 5.469798657718121e-07,
188
+ "loss": 0.0114,
189
  "step": 270
190
  },
191
  {
192
+ "epoch": 0.47,
193
+ "learning_rate": 5.302013422818791e-07,
194
+ "loss": 0.0139,
195
  "step": 280
196
  },
197
  {
198
+ "epoch": 0.49,
199
+ "learning_rate": 5.134228187919463e-07,
200
+ "loss": 0.0046,
201
  "step": 290
202
  },
203
  {
204
+ "epoch": 0.5,
205
+ "learning_rate": 4.966442953020134e-07,
206
+ "loss": 0.0019,
207
  "step": 300
208
  },
209
  {
210
+ "epoch": 0.5,
211
+ "eval_accuracy": 0.9907640638119227,
212
+ "eval_loss": 0.04056321829557419,
213
+ "eval_runtime": 58.6218,
214
+ "eval_samples_per_second": 40.633,
215
+ "eval_steps_per_second": 5.083,
216
  "step": 300
217
  },
218
  {
219
+ "epoch": 0.52,
220
+ "learning_rate": 4.798657718120805e-07,
221
+ "loss": 0.002,
222
  "step": 310
223
  },
224
  {
225
+ "epoch": 0.54,
226
+ "learning_rate": 4.630872483221476e-07,
227
+ "loss": 0.0034,
228
  "step": 320
229
  },
230
  {
231
+ "epoch": 0.55,
232
+ "learning_rate": 4.4630872483221475e-07,
233
+ "loss": 0.0092,
234
  "step": 330
235
  },
236
  {
237
+ "epoch": 0.57,
238
+ "learning_rate": 4.2953020134228183e-07,
239
+ "loss": 0.0021,
240
  "step": 340
241
  },
242
  {
243
+ "epoch": 0.59,
244
+ "learning_rate": 4.12751677852349e-07,
245
+ "loss": 0.0015,
246
  "step": 350
247
  },
248
  {
249
+ "epoch": 0.6,
250
+ "learning_rate": 3.959731543624161e-07,
251
+ "loss": 0.0383,
252
  "step": 360
253
  },
254
  {
255
+ "epoch": 0.62,
256
+ "learning_rate": 3.791946308724832e-07,
257
+ "loss": 0.0292,
258
  "step": 370
259
  },
260
  {
261
+ "epoch": 0.64,
262
+ "learning_rate": 3.624161073825503e-07,
263
+ "loss": 0.0072,
264
  "step": 380
265
  },
266
  {
267
+ "epoch": 0.65,
268
+ "learning_rate": 3.4563758389261744e-07,
269
+ "loss": 0.0278,
270
  "step": 390
271
  },
272
  {
273
+ "epoch": 0.67,
274
+ "learning_rate": 3.2885906040268457e-07,
275
+ "loss": 0.0015,
276
  "step": 400
277
  },
278
  {
279
+ "epoch": 0.67,
280
+ "eval_accuracy": 0.9916036943744753,
281
+ "eval_loss": 0.039836905896663666,
282
+ "eval_runtime": 58.6815,
283
+ "eval_samples_per_second": 40.592,
284
+ "eval_steps_per_second": 5.078,
285
  "step": 400
286
  },
287
  {
288
+ "epoch": 0.69,
289
+ "learning_rate": 3.1208053691275165e-07,
290
+ "loss": 0.0025,
291
  "step": 410
292
  },
293
  {
294
+ "epoch": 0.7,
295
+ "learning_rate": 2.953020134228188e-07,
296
+ "loss": 0.0036,
297
  "step": 420
298
  },
299
  {
300
+ "epoch": 0.72,
301
+ "learning_rate": 2.7852348993288586e-07,
302
+ "loss": 0.0014,
303
  "step": 430
304
  },
305
  {
306
+ "epoch": 0.74,
307
+ "learning_rate": 2.61744966442953e-07,
308
+ "loss": 0.0042,
309
  "step": 440
310
  },
311
  {
312
+ "epoch": 0.76,
313
+ "learning_rate": 2.4496644295302013e-07,
314
+ "loss": 0.0013,
315
  "step": 450
316
  },
317
  {
318
+ "epoch": 0.77,
319
+ "learning_rate": 2.2818791946308723e-07,
320
+ "loss": 0.0043,
321
  "step": 460
322
  },
323
  {
324
+ "epoch": 0.79,
325
+ "learning_rate": 2.1140939597315434e-07,
326
+ "loss": 0.0034,
327
  "step": 470
328
  },
329
  {
330
+ "epoch": 0.81,
331
+ "learning_rate": 1.9463087248322147e-07,
332
+ "loss": 0.0022,
333
  "step": 480
334
  },
335
  {
336
+ "epoch": 0.82,
337
+ "learning_rate": 1.7785234899328858e-07,
338
+ "loss": 0.0035,
339
  "step": 490
340
  },
341
  {
342
+ "epoch": 0.84,
343
+ "learning_rate": 1.6107382550335569e-07,
344
+ "loss": 0.0019,
345
  "step": 500
346
  },
347
  {
348
+ "epoch": 0.84,
349
+ "eval_accuracy": 0.9916036943744753,
350
+ "eval_loss": 0.03965451195836067,
351
+ "eval_runtime": 59.706,
352
+ "eval_samples_per_second": 39.895,
353
+ "eval_steps_per_second": 4.991,
354
  "step": 500
355
  },
356
  {
357
+ "epoch": 0.86,
358
+ "learning_rate": 1.4429530201342282e-07,
359
+ "loss": 0.0018,
360
  "step": 510
361
  },
362
  {
363
+ "epoch": 0.87,
364
+ "learning_rate": 1.2751677852348992e-07,
365
+ "loss": 0.0085,
366
  "step": 520
367
  },
368
  {
369
+ "epoch": 0.89,
370
+ "learning_rate": 1.1073825503355704e-07,
371
+ "loss": 0.0015,
372
  "step": 530
373
  },
374
  {
375
+ "epoch": 0.91,
376
+ "learning_rate": 9.395973154362416e-08,
377
+ "loss": 0.0016,
378
  "step": 540
379
  },
380
  {
381
+ "epoch": 0.92,
382
+ "learning_rate": 7.718120805369127e-08,
383
+ "loss": 0.0019,
384
  "step": 550
385
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  {
387
  "epoch": 0.94,
388
+ "learning_rate": 6.040268456375839e-08,
389
+ "loss": 0.0036,
390
+ "step": 560
391
  },
392
  {
393
  "epoch": 0.96,
394
+ "learning_rate": 4.36241610738255e-08,
395
+ "loss": 0.0084,
396
+ "step": 570
397
  },
398
  {
399
  "epoch": 0.97,
400
+ "learning_rate": 2.6845637583892614e-08,
401
+ "loss": 0.0015,
402
+ "step": 580
403
  },
404
  {
405
  "epoch": 0.99,
406
+ "learning_rate": 1.006711409395973e-08,
407
+ "loss": 0.0017,
408
+ "step": 590
409
  },
410
  {
411
  "epoch": 1.0,
412
+ "step": 596,
413
+ "total_flos": 2.610159532500861e+18,
414
+ "train_loss": 0.006745032241080431,
415
+ "train_runtime": 1079.8985,
416
+ "train_samples_per_second": 8.823,
417
+ "train_steps_per_second": 0.552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  }
419
  ],
420
  "logging_steps": 10,
421
+ "max_steps": 596,
422
+ "num_train_epochs": 1,
423
  "save_steps": 100,
424
+ "total_flos": 2.610159532500861e+18,
425
  "trial_name": null,
426
  "trial_params": null
427
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8e42c45dfffff4a80369d5d9aad05ac2e76b9cd8f1d5e29ff05f2d9e3f5bb3b
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75607155d243ac4ff36eeb85ef438bb769099e7b24b50eb811267dd330e538a
3
  size 4027