sanchit-gandhi HF staff commited on
Commit
c2d03f8
1 Parent(s): 00e2480

End of training

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ base_model: distil-whisper/distil-large-v3
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
- - common_voice_16_1
8
  metrics:
9
  - wer
10
  model-index:
@@ -14,8 +14,8 @@ model-index:
14
  name: Automatic Speech Recognition
15
  type: automatic-speech-recognition
16
  dataset:
17
- name: common_voice_16_1
18
- type: common_voice_16_1
19
  config: hi
20
  split: test
21
  args: hi
@@ -30,7 +30,7 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # distil-whisper/distil-large-v3
32
 
33
- This model is a fine-tuned version of [distil-whisper/distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) on the common_voice_16_1 dataset.
34
  It achieves the following results on the evaluation set:
35
  - Loss: 0.3749
36
  - Wer: 0.2664
 
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
+ - mozilla-foundation/common_voice_16_1
8
  metrics:
9
  - wer
10
  model-index:
 
14
  name: Automatic Speech Recognition
15
  type: automatic-speech-recognition
16
  dataset:
17
+ name: mozilla-foundation/common_voice_16_1 hi
18
+ type: mozilla-foundation/common_voice_16_1
19
  config: hi
20
  split: test
21
  args: hi
 
30
 
31
  # distil-whisper/distil-large-v3
32
 
33
+ This model is a fine-tuned version of [distil-whisper/distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) on the mozilla-foundation/common_voice_16_1 hi dataset.
34
  It achieves the following results on the evaluation set:
35
  - Loss: 0.3749
36
  - Wer: 0.2664
all_results.json CHANGED
@@ -1,8 +1,14 @@
1
  {
2
  "epoch": 22.52,
3
- "train_loss": 0.17524469082718716,
4
- "train_runtime": 15083.6622,
 
 
 
 
 
 
5
  "train_samples": 7099,
6
- "train_samples_per_second": 10.608,
7
- "train_steps_per_second": 0.331
8
  }
 
1
  {
2
  "epoch": 22.52,
3
+ "eval_loss": 0.37487614154815674,
4
+ "eval_runtime": 1345.0244,
5
+ "eval_samples": 3123,
6
+ "eval_samples_per_second": 2.322,
7
+ "eval_steps_per_second": 0.073,
8
+ "eval_wer": 0.26639882562002626,
9
+ "train_loss": 0.1750582966186106,
10
+ "train_runtime": 15499.9794,
11
  "train_samples": 7099,
12
+ "train_samples_per_second": 10.323,
13
+ "train_steps_per_second": 0.323
14
  }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 22.52,
3
+ "eval_loss": 0.37487614154815674,
4
+ "eval_runtime": 1345.0244,
5
+ "eval_samples": 3123,
6
+ "eval_samples_per_second": 2.322,
7
+ "eval_steps_per_second": 0.073,
8
+ "eval_wer": 0.26639882562002626
9
+ }
runs/Mar27_19-04-58_hf-dgx-01/events.out.tfevents.1711579662.hf-dgx-01.1894903.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d61f0e8201847fd3d90c3dcb9e37b4a3ce389e21a4f5c2a051e7fd7def8bcd93
3
+ size 406
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 22.52,
3
- "train_loss": 0.17524469082718716,
4
- "train_runtime": 15083.6622,
5
  "train_samples": 7099,
6
- "train_samples_per_second": 10.608,
7
- "train_steps_per_second": 0.331
8
  }
 
1
  {
2
  "epoch": 22.52,
3
+ "train_loss": 0.1750582966186106,
4
+ "train_runtime": 15499.9794,
5
  "train_samples": 7099,
6
+ "train_samples_per_second": 10.323,
7
+ "train_steps_per_second": 0.323
8
  }
trainer_state.json CHANGED
@@ -10,1457 +10,1457 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.11,
13
- "grad_norm": 29.428333282470703,
14
  "learning_rate": 4.000000000000001e-06,
15
  "loss": 11.9112,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.23,
20
- "grad_norm": 12.572431564331055,
21
  "learning_rate": 9e-06,
22
- "loss": 5.9607,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.34,
27
- "grad_norm": 6.247668743133545,
28
  "learning_rate": 1.4000000000000001e-05,
29
  "loss": 2.7899,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.45,
34
- "grad_norm": 5.499792098999023,
35
  "learning_rate": 1.9e-05,
36
- "loss": 1.934,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.56,
41
- "grad_norm": 10.862707138061523,
42
  "learning_rate": 2.4e-05,
43
- "loss": 1.1845,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.68,
48
- "grad_norm": 6.8538055419921875,
49
  "learning_rate": 2.9e-05,
50
- "loss": 0.7883,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.79,
55
- "grad_norm": 8.127602577209473,
56
  "learning_rate": 3.4000000000000007e-05,
57
- "loss": 0.6147,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.9,
62
- "grad_norm": 4.003240585327148,
63
  "learning_rate": 3.9000000000000006e-05,
64
- "loss": 0.5233,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 1.01,
69
- "grad_norm": 3.650707483291626,
70
  "learning_rate": 4.4000000000000006e-05,
71
  "loss": 0.453,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 1.13,
76
- "grad_norm": 4.5928239822387695,
77
  "learning_rate": 4.9e-05,
78
  "loss": 0.3913,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 1.24,
83
- "grad_norm": 4.008325576782227,
84
  "learning_rate": 5.4000000000000005e-05,
85
  "loss": 0.3729,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 1.35,
90
- "grad_norm": 4.239988327026367,
91
  "learning_rate": 5.9e-05,
92
  "loss": 0.3544,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 1.46,
97
- "grad_norm": 3.8822410106658936,
98
  "learning_rate": 6.400000000000001e-05,
99
- "loss": 0.3229,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 1.58,
104
- "grad_norm": 3.0306766033172607,
105
  "learning_rate": 6.9e-05,
106
- "loss": 0.3357,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 1.69,
111
- "grad_norm": 2.7435803413391113,
112
  "learning_rate": 7.4e-05,
113
- "loss": 0.3148,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 1.8,
118
- "grad_norm": 3.684567928314209,
119
  "learning_rate": 7.900000000000001e-05,
120
- "loss": 0.2912,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 1.91,
125
- "grad_norm": 2.486985206604004,
126
  "learning_rate": 8.4e-05,
127
- "loss": 0.3058,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 2.03,
132
- "grad_norm": 2.5083959102630615,
133
  "learning_rate": 8.900000000000001e-05,
134
- "loss": 0.2651,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 2.14,
139
- "grad_norm": 4.557464599609375,
140
  "learning_rate": 9.4e-05,
141
- "loss": 0.2339,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 2.25,
146
- "grad_norm": 3.3180325031280518,
147
  "learning_rate": 9.900000000000001e-05,
148
- "loss": 0.2337,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 2.36,
153
- "grad_norm": 2.496147632598877,
154
  "learning_rate": 9.955555555555556e-05,
155
- "loss": 0.2372,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 2.48,
160
- "grad_norm": 2.2330338954925537,
161
  "learning_rate": 9.900000000000001e-05,
162
- "loss": 0.2219,
163
  "step": 550
164
  },
165
  {
166
  "epoch": 2.59,
167
- "grad_norm": 3.0495846271514893,
168
  "learning_rate": 9.844444444444444e-05,
169
- "loss": 0.2323,
170
  "step": 575
171
  },
172
  {
173
  "epoch": 2.7,
174
- "grad_norm": 2.3662843704223633,
175
  "learning_rate": 9.78888888888889e-05,
176
- "loss": 0.2324,
177
  "step": 600
178
  },
179
  {
180
  "epoch": 2.82,
181
- "grad_norm": 1.981231451034546,
182
  "learning_rate": 9.733333333333335e-05,
183
- "loss": 0.2088,
184
  "step": 625
185
  },
186
  {
187
  "epoch": 2.93,
188
- "grad_norm": 2.484710454940796,
189
  "learning_rate": 9.677777777777778e-05,
190
- "loss": 0.2195,
191
  "step": 650
192
  },
193
  {
194
  "epoch": 3.04,
195
- "grad_norm": 1.7488161325454712,
196
  "learning_rate": 9.622222222222222e-05,
197
- "loss": 0.1868,
198
  "step": 675
199
  },
200
  {
201
  "epoch": 3.15,
202
- "grad_norm": 2.266071081161499,
203
  "learning_rate": 9.566666666666667e-05,
204
- "loss": 0.1537,
205
  "step": 700
206
  },
207
  {
208
  "epoch": 3.27,
209
- "grad_norm": 1.6045178174972534,
210
  "learning_rate": 9.511111111111112e-05,
211
- "loss": 0.157,
212
  "step": 725
213
  },
214
  {
215
  "epoch": 3.38,
216
- "grad_norm": 1.8283653259277344,
217
  "learning_rate": 9.455555555555556e-05,
218
- "loss": 0.1516,
219
  "step": 750
220
  },
221
  {
222
  "epoch": 3.49,
223
- "grad_norm": 2.1718389987945557,
224
  "learning_rate": 9.4e-05,
225
- "loss": 0.1657,
226
  "step": 775
227
  },
228
  {
229
  "epoch": 3.6,
230
- "grad_norm": 2.778785467147827,
231
  "learning_rate": 9.344444444444444e-05,
232
- "loss": 0.1529,
233
  "step": 800
234
  },
235
  {
236
  "epoch": 3.72,
237
- "grad_norm": 2.0423874855041504,
238
  "learning_rate": 9.28888888888889e-05,
239
- "loss": 0.153,
240
  "step": 825
241
  },
242
  {
243
  "epoch": 3.83,
244
- "grad_norm": 1.7835185527801514,
245
  "learning_rate": 9.233333333333333e-05,
246
- "loss": 0.1514,
247
  "step": 850
248
  },
249
  {
250
  "epoch": 3.94,
251
- "grad_norm": 2.091015100479126,
252
  "learning_rate": 9.177777777777778e-05,
253
- "loss": 0.151,
254
  "step": 875
255
  },
256
  {
257
  "epoch": 4.05,
258
- "grad_norm": 1.47210693359375,
259
  "learning_rate": 9.122222222222223e-05,
260
- "loss": 0.1248,
261
  "step": 900
262
  },
263
  {
264
  "epoch": 4.17,
265
- "grad_norm": 1.5700939893722534,
266
  "learning_rate": 9.066666666666667e-05,
267
- "loss": 0.0955,
268
  "step": 925
269
  },
270
  {
271
  "epoch": 4.28,
272
- "grad_norm": 1.0798161029815674,
273
  "learning_rate": 9.011111111111111e-05,
274
- "loss": 0.0965,
275
  "step": 950
276
  },
277
  {
278
  "epoch": 4.39,
279
- "grad_norm": 1.250017523765564,
280
  "learning_rate": 8.955555555555556e-05,
281
- "loss": 0.1029,
282
  "step": 975
283
  },
284
  {
285
  "epoch": 4.5,
286
- "grad_norm": 1.3333516120910645,
287
  "learning_rate": 8.900000000000001e-05,
288
- "loss": 0.1015,
289
  "step": 1000
290
  },
291
  {
292
  "epoch": 4.5,
293
- "eval_loss": 0.3065292239189148,
294
- "eval_runtime": 1302.6648,
295
- "eval_samples_per_second": 2.397,
296
- "eval_steps_per_second": 0.075,
297
- "eval_wer": 0.3243838368229931,
298
  "step": 1000
299
  },
300
  {
301
  "epoch": 4.62,
302
- "grad_norm": 2.2534544467926025,
303
  "learning_rate": 8.844444444444445e-05,
304
- "loss": 0.1098,
305
  "step": 1025
306
  },
307
  {
308
  "epoch": 4.73,
309
- "grad_norm": 1.6706323623657227,
310
  "learning_rate": 8.78888888888889e-05,
311
- "loss": 0.1066,
312
  "step": 1050
313
  },
314
  {
315
  "epoch": 4.84,
316
- "grad_norm": 1.9353983402252197,
317
  "learning_rate": 8.733333333333333e-05,
318
- "loss": 0.1033,
319
  "step": 1075
320
  },
321
  {
322
  "epoch": 4.95,
323
- "grad_norm": 1.833392858505249,
324
  "learning_rate": 8.677777777777778e-05,
325
- "loss": 0.1041,
326
  "step": 1100
327
  },
328
  {
329
  "epoch": 5.07,
330
- "grad_norm": 1.094043254852295,
331
  "learning_rate": 8.622222222222222e-05,
332
- "loss": 0.0782,
333
  "step": 1125
334
  },
335
  {
336
  "epoch": 5.18,
337
- "grad_norm": 1.6280676126480103,
338
  "learning_rate": 8.566666666666667e-05,
339
- "loss": 0.0604,
340
  "step": 1150
341
  },
342
  {
343
  "epoch": 5.29,
344
- "grad_norm": 1.2326525449752808,
345
  "learning_rate": 8.511111111111112e-05,
346
- "loss": 0.0665,
347
  "step": 1175
348
  },
349
  {
350
  "epoch": 5.41,
351
- "grad_norm": 1.186036467552185,
352
  "learning_rate": 8.455555555555556e-05,
353
- "loss": 0.0679,
354
  "step": 1200
355
  },
356
  {
357
  "epoch": 5.52,
358
- "grad_norm": 1.3472570180892944,
359
  "learning_rate": 8.4e-05,
360
- "loss": 0.0656,
361
  "step": 1225
362
  },
363
  {
364
  "epoch": 5.63,
365
- "grad_norm": 2.1403074264526367,
366
  "learning_rate": 8.344444444444445e-05,
367
- "loss": 0.0674,
368
  "step": 1250
369
  },
370
  {
371
  "epoch": 5.74,
372
- "grad_norm": 1.0580947399139404,
373
  "learning_rate": 8.28888888888889e-05,
374
- "loss": 0.0713,
375
  "step": 1275
376
  },
377
  {
378
  "epoch": 5.86,
379
- "grad_norm": 1.0808650255203247,
380
  "learning_rate": 8.233333333333333e-05,
381
- "loss": 0.0713,
382
  "step": 1300
383
  },
384
  {
385
  "epoch": 5.97,
386
- "grad_norm": 1.0721344947814941,
387
  "learning_rate": 8.177777777777778e-05,
388
- "loss": 0.0707,
389
  "step": 1325
390
  },
391
  {
392
  "epoch": 6.08,
393
- "grad_norm": 1.7433174848556519,
394
  "learning_rate": 8.122222222222222e-05,
395
- "loss": 0.0492,
396
  "step": 1350
397
  },
398
  {
399
  "epoch": 6.19,
400
- "grad_norm": 0.9549305438995361,
401
  "learning_rate": 8.066666666666667e-05,
402
- "loss": 0.0418,
403
  "step": 1375
404
  },
405
  {
406
  "epoch": 6.31,
407
- "grad_norm": 1.4030609130859375,
408
  "learning_rate": 8.011111111111111e-05,
409
- "loss": 0.0382,
410
  "step": 1400
411
  },
412
  {
413
  "epoch": 6.42,
414
- "grad_norm": 0.9085283279418945,
415
  "learning_rate": 7.955555555555556e-05,
416
- "loss": 0.0369,
417
  "step": 1425
418
  },
419
  {
420
  "epoch": 6.53,
421
- "grad_norm": 1.0393314361572266,
422
  "learning_rate": 7.900000000000001e-05,
423
- "loss": 0.0403,
424
  "step": 1450
425
  },
426
  {
427
  "epoch": 6.64,
428
- "grad_norm": 0.675774872303009,
429
  "learning_rate": 7.844444444444446e-05,
430
- "loss": 0.0414,
431
  "step": 1475
432
  },
433
  {
434
  "epoch": 6.76,
435
- "grad_norm": 0.8051535487174988,
436
  "learning_rate": 7.788888888888888e-05,
437
- "loss": 0.0426,
438
  "step": 1500
439
  },
440
  {
441
  "epoch": 6.87,
442
- "grad_norm": 1.4626388549804688,
443
  "learning_rate": 7.733333333333333e-05,
444
- "loss": 0.0436,
445
  "step": 1525
446
  },
447
  {
448
  "epoch": 6.98,
449
- "grad_norm": 0.8418045043945312,
450
  "learning_rate": 7.677777777777778e-05,
451
- "loss": 0.0442,
452
  "step": 1550
453
  },
454
  {
455
  "epoch": 7.09,
456
- "grad_norm": 1.3747352361679077,
457
  "learning_rate": 7.622222222222223e-05,
458
- "loss": 0.0281,
459
  "step": 1575
460
  },
461
  {
462
  "epoch": 7.21,
463
- "grad_norm": 0.5290963649749756,
464
  "learning_rate": 7.566666666666667e-05,
465
- "loss": 0.0237,
466
  "step": 1600
467
  },
468
  {
469
  "epoch": 7.32,
470
- "grad_norm": 1.2137552499771118,
471
  "learning_rate": 7.511111111111111e-05,
472
- "loss": 0.0249,
473
  "step": 1625
474
  },
475
  {
476
  "epoch": 7.43,
477
- "grad_norm": 0.7687398791313171,
478
  "learning_rate": 7.455555555555556e-05,
479
- "loss": 0.0261,
480
  "step": 1650
481
  },
482
  {
483
  "epoch": 7.55,
484
- "grad_norm": 1.1545344591140747,
485
  "learning_rate": 7.4e-05,
486
- "loss": 0.0249,
487
  "step": 1675
488
  },
489
  {
490
  "epoch": 7.66,
491
- "grad_norm": 0.7673143148422241,
492
  "learning_rate": 7.344444444444445e-05,
493
- "loss": 0.0248,
494
  "step": 1700
495
  },
496
  {
497
  "epoch": 7.77,
498
- "grad_norm": 0.9905190467834473,
499
  "learning_rate": 7.28888888888889e-05,
500
- "loss": 0.0254,
501
  "step": 1725
502
  },
503
  {
504
  "epoch": 7.88,
505
- "grad_norm": 1.764397382736206,
506
  "learning_rate": 7.233333333333335e-05,
507
- "loss": 0.0297,
508
  "step": 1750
509
  },
510
  {
511
  "epoch": 8.0,
512
- "grad_norm": 0.9069448709487915,
513
  "learning_rate": 7.177777777777777e-05,
514
- "loss": 0.0275,
515
  "step": 1775
516
  },
517
  {
518
  "epoch": 8.11,
519
- "grad_norm": 1.1385760307312012,
520
  "learning_rate": 7.122222222222222e-05,
521
- "loss": 0.0162,
522
  "step": 1800
523
  },
524
  {
525
  "epoch": 8.22,
526
- "grad_norm": 0.5694571733474731,
527
  "learning_rate": 7.066666666666667e-05,
528
- "loss": 0.0149,
529
  "step": 1825
530
  },
531
  {
532
  "epoch": 8.33,
533
- "grad_norm": 1.0839495658874512,
534
  "learning_rate": 7.011111111111112e-05,
535
- "loss": 0.0175,
536
  "step": 1850
537
  },
538
  {
539
  "epoch": 8.45,
540
- "grad_norm": 0.7086426019668579,
541
  "learning_rate": 6.955555555555556e-05,
542
- "loss": 0.0189,
543
  "step": 1875
544
  },
545
  {
546
  "epoch": 8.56,
547
- "grad_norm": 0.9548362493515015,
548
  "learning_rate": 6.9e-05,
549
- "loss": 0.0193,
550
  "step": 1900
551
  },
552
  {
553
  "epoch": 8.67,
554
- "grad_norm": 0.9621508717536926,
555
  "learning_rate": 6.844444444444445e-05,
556
- "loss": 0.0186,
557
  "step": 1925
558
  },
559
  {
560
  "epoch": 8.78,
561
- "grad_norm": 0.6629220843315125,
562
  "learning_rate": 6.788888888888888e-05,
563
- "loss": 0.0171,
564
  "step": 1950
565
  },
566
  {
567
  "epoch": 8.9,
568
- "grad_norm": 0.7981088161468506,
569
  "learning_rate": 6.733333333333333e-05,
570
- "loss": 0.0175,
571
  "step": 1975
572
  },
573
  {
574
  "epoch": 9.01,
575
- "grad_norm": 0.45495709776878357,
576
  "learning_rate": 6.677777777777779e-05,
577
- "loss": 0.0167,
578
  "step": 2000
579
  },
580
  {
581
  "epoch": 9.01,
582
- "eval_loss": 0.3443203866481781,
583
- "eval_runtime": 1269.8219,
584
- "eval_samples_per_second": 2.459,
585
- "eval_steps_per_second": 0.077,
586
- "eval_wer": 0.2994668933013984,
587
  "step": 2000
588
  },
589
  {
590
  "epoch": 9.12,
591
- "grad_norm": 1.0250108242034912,
592
  "learning_rate": 6.622222222222224e-05,
593
  "loss": 0.0124,
594
  "step": 2025
595
  },
596
  {
597
  "epoch": 9.23,
598
- "grad_norm": 0.533909022808075,
599
  "learning_rate": 6.566666666666666e-05,
600
- "loss": 0.0128,
601
  "step": 2050
602
  },
603
  {
604
  "epoch": 9.35,
605
- "grad_norm": 0.5022910237312317,
606
  "learning_rate": 6.511111111111111e-05,
607
- "loss": 0.0127,
608
  "step": 2075
609
  },
610
  {
611
  "epoch": 9.46,
612
- "grad_norm": 1.3371328115463257,
613
  "learning_rate": 6.455555555555556e-05,
614
- "loss": 0.0116,
615
  "step": 2100
616
  },
617
  {
618
  "epoch": 9.57,
619
- "grad_norm": 1.2396471500396729,
620
  "learning_rate": 6.400000000000001e-05,
621
- "loss": 0.0112,
622
  "step": 2125
623
  },
624
  {
625
  "epoch": 9.68,
626
- "grad_norm": 1.2121708393096924,
627
  "learning_rate": 6.344444444444445e-05,
628
- "loss": 0.0107,
629
  "step": 2150
630
  },
631
  {
632
  "epoch": 9.8,
633
- "grad_norm": 1.3228121995925903,
634
  "learning_rate": 6.28888888888889e-05,
635
- "loss": 0.0108,
636
  "step": 2175
637
  },
638
  {
639
  "epoch": 9.91,
640
- "grad_norm": 0.6204155683517456,
641
  "learning_rate": 6.233333333333334e-05,
642
- "loss": 0.0123,
643
  "step": 2200
644
  },
645
  {
646
  "epoch": 10.02,
647
- "grad_norm": 0.4221612811088562,
648
  "learning_rate": 6.177777777777779e-05,
649
- "loss": 0.0117,
650
  "step": 2225
651
  },
652
  {
653
  "epoch": 10.14,
654
- "grad_norm": 0.8225328922271729,
655
  "learning_rate": 6.122222222222222e-05,
656
- "loss": 0.008,
657
  "step": 2250
658
  },
659
  {
660
  "epoch": 10.25,
661
- "grad_norm": 0.22648921608924866,
662
  "learning_rate": 6.066666666666667e-05,
663
- "loss": 0.0075,
664
  "step": 2275
665
  },
666
  {
667
  "epoch": 10.36,
668
- "grad_norm": 1.0620574951171875,
669
  "learning_rate": 6.011111111111112e-05,
670
- "loss": 0.0077,
671
  "step": 2300
672
  },
673
  {
674
  "epoch": 10.47,
675
- "grad_norm": 0.5009572505950928,
676
  "learning_rate": 5.9555555555555554e-05,
677
- "loss": 0.008,
678
  "step": 2325
679
  },
680
  {
681
  "epoch": 10.59,
682
- "grad_norm": 0.6466513872146606,
683
  "learning_rate": 5.9e-05,
684
- "loss": 0.0098,
685
  "step": 2350
686
  },
687
  {
688
  "epoch": 10.7,
689
- "grad_norm": 0.2255641371011734,
690
  "learning_rate": 5.844444444444445e-05,
691
- "loss": 0.0094,
692
  "step": 2375
693
  },
694
  {
695
  "epoch": 10.81,
696
- "grad_norm": 0.838545560836792,
697
  "learning_rate": 5.788888888888889e-05,
698
- "loss": 0.0089,
699
  "step": 2400
700
  },
701
  {
702
  "epoch": 10.92,
703
- "grad_norm": 0.6793853044509888,
704
  "learning_rate": 5.7333333333333336e-05,
705
- "loss": 0.0087,
706
  "step": 2425
707
  },
708
  {
709
  "epoch": 11.04,
710
- "grad_norm": 0.548841655254364,
711
  "learning_rate": 5.6777777777777786e-05,
712
- "loss": 0.0069,
713
  "step": 2450
714
  },
715
  {
716
  "epoch": 11.15,
717
- "grad_norm": 0.22741466760635376,
718
  "learning_rate": 5.622222222222222e-05,
719
- "loss": 0.0065,
720
  "step": 2475
721
  },
722
  {
723
  "epoch": 11.26,
724
- "grad_norm": 0.4155316650867462,
725
  "learning_rate": 5.566666666666667e-05,
726
- "loss": 0.0058,
727
  "step": 2500
728
  },
729
  {
730
  "epoch": 11.37,
731
- "grad_norm": 0.48344260454177856,
732
  "learning_rate": 5.511111111111111e-05,
733
- "loss": 0.005,
734
  "step": 2525
735
  },
736
  {
737
  "epoch": 11.49,
738
- "grad_norm": 0.9006750583648682,
739
  "learning_rate": 5.455555555555556e-05,
740
- "loss": 0.0045,
741
  "step": 2550
742
  },
743
  {
744
  "epoch": 11.6,
745
- "grad_norm": 0.9966240525245667,
746
  "learning_rate": 5.4000000000000005e-05,
747
- "loss": 0.0047,
748
  "step": 2575
749
  },
750
  {
751
  "epoch": 11.71,
752
- "grad_norm": 0.39858147501945496,
753
  "learning_rate": 5.3444444444444455e-05,
754
- "loss": 0.0053,
755
  "step": 2600
756
  },
757
  {
758
  "epoch": 11.82,
759
- "grad_norm": 0.6118489503860474,
760
  "learning_rate": 5.2888888888888885e-05,
761
- "loss": 0.0053,
762
  "step": 2625
763
  },
764
  {
765
  "epoch": 11.94,
766
- "grad_norm": 0.5074841976165771,
767
  "learning_rate": 5.2333333333333336e-05,
768
- "loss": 0.0057,
769
  "step": 2650
770
  },
771
  {
772
  "epoch": 12.05,
773
- "grad_norm": 0.6888458728790283,
774
  "learning_rate": 5.177777777777778e-05,
775
- "loss": 0.0053,
776
  "step": 2675
777
  },
778
  {
779
  "epoch": 12.16,
780
- "grad_norm": 0.7311161160469055,
781
  "learning_rate": 5.122222222222223e-05,
782
- "loss": 0.006,
783
  "step": 2700
784
  },
785
  {
786
  "epoch": 12.27,
787
- "grad_norm": 0.47264620661735535,
788
  "learning_rate": 5.0666666666666674e-05,
789
- "loss": 0.0058,
790
  "step": 2725
791
  },
792
  {
793
  "epoch": 12.39,
794
- "grad_norm": 0.6639235019683838,
795
  "learning_rate": 5.011111111111111e-05,
796
- "loss": 0.0052,
797
  "step": 2750
798
  },
799
  {
800
  "epoch": 12.5,
801
- "grad_norm": 0.1161256805062294,
802
  "learning_rate": 4.955555555555556e-05,
803
- "loss": 0.0038,
804
  "step": 2775
805
  },
806
  {
807
  "epoch": 12.61,
808
- "grad_norm": 0.4923400580883026,
809
  "learning_rate": 4.9e-05,
810
- "loss": 0.0036,
811
  "step": 2800
812
  },
813
  {
814
  "epoch": 12.73,
815
- "grad_norm": 0.6149506568908691,
816
  "learning_rate": 4.844444444444445e-05,
817
- "loss": 0.0046,
818
  "step": 2825
819
  },
820
  {
821
  "epoch": 12.84,
822
- "grad_norm": 0.16888651251792908,
823
  "learning_rate": 4.7888888888888886e-05,
824
- "loss": 0.0041,
825
  "step": 2850
826
  },
827
  {
828
  "epoch": 12.95,
829
- "grad_norm": 1.0652014017105103,
830
  "learning_rate": 4.7333333333333336e-05,
831
- "loss": 0.0041,
832
  "step": 2875
833
  },
834
  {
835
  "epoch": 13.06,
836
- "grad_norm": 0.21759897470474243,
837
  "learning_rate": 4.677777777777778e-05,
838
- "loss": 0.003,
839
  "step": 2900
840
  },
841
  {
842
  "epoch": 13.18,
843
- "grad_norm": 0.23394200205802917,
844
  "learning_rate": 4.6222222222222224e-05,
845
- "loss": 0.0034,
846
  "step": 2925
847
  },
848
  {
849
  "epoch": 13.29,
850
- "grad_norm": 0.05768038332462311,
851
  "learning_rate": 4.566666666666667e-05,
852
- "loss": 0.0037,
853
  "step": 2950
854
  },
855
  {
856
  "epoch": 13.4,
857
- "grad_norm": 0.08611828088760376,
858
  "learning_rate": 4.511111111111112e-05,
859
- "loss": 0.0034,
860
  "step": 2975
861
  },
862
  {
863
  "epoch": 13.51,
864
- "grad_norm": 0.1028035581111908,
865
  "learning_rate": 4.4555555555555555e-05,
866
- "loss": 0.0032,
867
  "step": 3000
868
  },
869
  {
870
  "epoch": 13.51,
871
- "eval_loss": 0.3575945198535919,
872
- "eval_runtime": 1297.8448,
873
- "eval_samples_per_second": 2.406,
874
- "eval_steps_per_second": 0.076,
875
- "eval_wer": 0.27779494707563934,
876
  "step": 3000
877
  },
878
  {
879
  "epoch": 13.63,
880
- "grad_norm": 0.23182912170886993,
881
  "learning_rate": 4.4000000000000006e-05,
882
- "loss": 0.0027,
883
  "step": 3025
884
  },
885
  {
886
  "epoch": 13.74,
887
- "grad_norm": 0.100206658244133,
888
  "learning_rate": 4.344444444444445e-05,
889
- "loss": 0.0027,
890
  "step": 3050
891
  },
892
  {
893
  "epoch": 13.85,
894
- "grad_norm": 0.9118719100952148,
895
  "learning_rate": 4.2888888888888886e-05,
896
- "loss": 0.003,
897
  "step": 3075
898
  },
899
  {
900
  "epoch": 13.96,
901
- "grad_norm": 0.06793611496686935,
902
  "learning_rate": 4.233333333333334e-05,
903
- "loss": 0.003,
904
  "step": 3100
905
  },
906
  {
907
  "epoch": 14.08,
908
- "grad_norm": 0.0683990940451622,
909
  "learning_rate": 4.177777777777778e-05,
910
- "loss": 0.0021,
911
  "step": 3125
912
  },
913
  {
914
  "epoch": 14.19,
915
- "grad_norm": 0.19087089598178864,
916
  "learning_rate": 4.1222222222222224e-05,
917
- "loss": 0.0028,
918
  "step": 3150
919
  },
920
  {
921
  "epoch": 14.3,
922
- "grad_norm": 0.14526407420635223,
923
  "learning_rate": 4.066666666666667e-05,
924
- "loss": 0.0025,
925
  "step": 3175
926
  },
927
  {
928
  "epoch": 14.41,
929
- "grad_norm": 0.5902572870254517,
930
  "learning_rate": 4.011111111111111e-05,
931
- "loss": 0.0031,
932
  "step": 3200
933
  },
934
  {
935
  "epoch": 14.53,
936
- "grad_norm": 0.1988796442747116,
937
  "learning_rate": 3.9555555555555556e-05,
938
- "loss": 0.0021,
939
  "step": 3225
940
  },
941
  {
942
  "epoch": 14.64,
943
- "grad_norm": 0.178738534450531,
944
  "learning_rate": 3.9000000000000006e-05,
945
- "loss": 0.0031,
946
  "step": 3250
947
  },
948
  {
949
  "epoch": 14.75,
950
- "grad_norm": 0.03732344135642052,
951
  "learning_rate": 3.844444444444444e-05,
952
- "loss": 0.0026,
953
  "step": 3275
954
  },
955
  {
956
  "epoch": 14.86,
957
- "grad_norm": 0.047354888170957565,
958
  "learning_rate": 3.7888888888888894e-05,
959
- "loss": 0.0016,
960
  "step": 3300
961
  },
962
  {
963
  "epoch": 14.98,
964
- "grad_norm": 0.058274924755096436,
965
  "learning_rate": 3.733333333333334e-05,
966
- "loss": 0.0019,
967
  "step": 3325
968
  },
969
  {
970
  "epoch": 15.09,
971
- "grad_norm": 1.4180477857589722,
972
  "learning_rate": 3.677777777777778e-05,
973
- "loss": 0.0016,
974
  "step": 3350
975
  },
976
  {
977
  "epoch": 15.2,
978
- "grad_norm": 0.03281378000974655,
979
  "learning_rate": 3.6222222222222225e-05,
980
- "loss": 0.0016,
981
  "step": 3375
982
  },
983
  {
984
  "epoch": 15.32,
985
- "grad_norm": 0.2159404158592224,
986
  "learning_rate": 3.566666666666667e-05,
987
- "loss": 0.0026,
988
  "step": 3400
989
  },
990
  {
991
  "epoch": 15.43,
992
- "grad_norm": 0.18890638649463654,
993
  "learning_rate": 3.511111111111111e-05,
994
- "loss": 0.0016,
995
  "step": 3425
996
  },
997
  {
998
  "epoch": 15.54,
999
- "grad_norm": 0.022921651601791382,
1000
  "learning_rate": 3.4555555555555556e-05,
1001
- "loss": 0.0012,
1002
  "step": 3450
1003
  },
1004
  {
1005
  "epoch": 15.65,
1006
- "grad_norm": 0.02838265895843506,
1007
  "learning_rate": 3.4000000000000007e-05,
1008
- "loss": 0.0014,
1009
  "step": 3475
1010
  },
1011
  {
1012
  "epoch": 15.77,
1013
- "grad_norm": 0.04957688972353935,
1014
  "learning_rate": 3.3444444444444443e-05,
1015
- "loss": 0.0012,
1016
  "step": 3500
1017
  },
1018
  {
1019
  "epoch": 15.88,
1020
- "grad_norm": 0.03910296410322189,
1021
  "learning_rate": 3.2888888888888894e-05,
1022
- "loss": 0.0008,
1023
  "step": 3525
1024
  },
1025
  {
1026
  "epoch": 15.99,
1027
- "grad_norm": 0.3031899034976959,
1028
  "learning_rate": 3.233333333333333e-05,
1029
- "loss": 0.0015,
1030
  "step": 3550
1031
  },
1032
  {
1033
  "epoch": 16.1,
1034
- "grad_norm": 0.026370937004685402,
1035
  "learning_rate": 3.177777777777778e-05,
1036
- "loss": 0.0009,
1037
  "step": 3575
1038
  },
1039
  {
1040
  "epoch": 16.22,
1041
- "grad_norm": 0.04645024240016937,
1042
  "learning_rate": 3.1222222222222225e-05,
1043
- "loss": 0.0014,
1044
  "step": 3600
1045
  },
1046
  {
1047
  "epoch": 16.33,
1048
- "grad_norm": 0.03346904739737511,
1049
  "learning_rate": 3.066666666666667e-05,
1050
- "loss": 0.001,
1051
  "step": 3625
1052
  },
1053
  {
1054
  "epoch": 16.44,
1055
- "grad_norm": 0.41791531443595886,
1056
  "learning_rate": 3.0111111111111113e-05,
1057
- "loss": 0.0019,
1058
  "step": 3650
1059
  },
1060
  {
1061
  "epoch": 16.55,
1062
- "grad_norm": 0.023621816188097,
1063
  "learning_rate": 2.955555555555556e-05,
1064
- "loss": 0.0009,
1065
  "step": 3675
1066
  },
1067
  {
1068
  "epoch": 16.67,
1069
- "grad_norm": 0.020701350644230843,
1070
  "learning_rate": 2.9e-05,
1071
- "loss": 0.0009,
1072
  "step": 3700
1073
  },
1074
  {
1075
  "epoch": 16.78,
1076
- "grad_norm": 0.018095409497618675,
1077
  "learning_rate": 2.8444444444444447e-05,
1078
  "loss": 0.0007,
1079
  "step": 3725
1080
  },
1081
  {
1082
  "epoch": 16.89,
1083
- "grad_norm": 0.03800148516893387,
1084
  "learning_rate": 2.788888888888889e-05,
1085
- "loss": 0.001,
1086
  "step": 3750
1087
  },
1088
  {
1089
  "epoch": 17.0,
1090
- "grad_norm": 0.0219491608440876,
1091
  "learning_rate": 2.733333333333333e-05,
1092
- "loss": 0.0012,
1093
  "step": 3775
1094
  },
1095
  {
1096
  "epoch": 17.12,
1097
- "grad_norm": 0.19971542060375214,
1098
  "learning_rate": 2.677777777777778e-05,
1099
- "loss": 0.001,
1100
  "step": 3800
1101
  },
1102
  {
1103
  "epoch": 17.23,
1104
- "grad_norm": 0.022324278950691223,
1105
  "learning_rate": 2.6222222222222226e-05,
1106
- "loss": 0.0005,
1107
  "step": 3825
1108
  },
1109
  {
1110
  "epoch": 17.34,
1111
- "grad_norm": 0.014598184265196323,
1112
  "learning_rate": 2.5666666666666666e-05,
1113
- "loss": 0.0007,
1114
  "step": 3850
1115
  },
1116
  {
1117
  "epoch": 17.45,
1118
- "grad_norm": 0.01482320111244917,
1119
  "learning_rate": 2.5111111111111113e-05,
1120
- "loss": 0.0008,
1121
  "step": 3875
1122
  },
1123
  {
1124
  "epoch": 17.57,
1125
- "grad_norm": 0.019341906532645226,
1126
  "learning_rate": 2.4555555555555557e-05,
1127
- "loss": 0.0005,
1128
  "step": 3900
1129
  },
1130
  {
1131
  "epoch": 17.68,
1132
- "grad_norm": 0.044308606535196304,
1133
  "learning_rate": 2.4e-05,
1134
- "loss": 0.0008,
1135
  "step": 3925
1136
  },
1137
  {
1138
  "epoch": 17.79,
1139
- "grad_norm": 0.01700867898762226,
1140
  "learning_rate": 2.3444444444444448e-05,
1141
- "loss": 0.0009,
1142
  "step": 3950
1143
  },
1144
  {
1145
  "epoch": 17.91,
1146
- "grad_norm": 0.01428561843931675,
1147
  "learning_rate": 2.288888888888889e-05,
1148
- "loss": 0.0004,
1149
  "step": 3975
1150
  },
1151
  {
1152
  "epoch": 18.02,
1153
- "grad_norm": 0.011909844353795052,
1154
  "learning_rate": 2.2333333333333335e-05,
1155
- "loss": 0.0004,
1156
  "step": 4000
1157
  },
1158
  {
1159
  "epoch": 18.02,
1160
- "eval_loss": 0.36695417761802673,
1161
- "eval_runtime": 1296.9402,
1162
- "eval_samples_per_second": 2.408,
1163
- "eval_steps_per_second": 0.076,
1164
- "eval_wer": 0.2677122769064359,
1165
  "step": 4000
1166
  },
1167
  {
1168
  "epoch": 18.13,
1169
- "grad_norm": 0.011953528970479965,
1170
  "learning_rate": 2.177777777777778e-05,
1171
- "loss": 0.0004,
1172
  "step": 4025
1173
  },
1174
  {
1175
  "epoch": 18.24,
1176
- "grad_norm": 0.013035556301474571,
1177
  "learning_rate": 2.1222222222222223e-05,
1178
- "loss": 0.0005,
1179
  "step": 4050
1180
  },
1181
  {
1182
  "epoch": 18.36,
1183
- "grad_norm": 0.011018014512956142,
1184
  "learning_rate": 2.0666666666666666e-05,
1185
- "loss": 0.0003,
1186
  "step": 4075
1187
  },
1188
  {
1189
  "epoch": 18.47,
1190
- "grad_norm": 0.011594709008932114,
1191
  "learning_rate": 2.011111111111111e-05,
1192
- "loss": 0.0004,
1193
  "step": 4100
1194
  },
1195
  {
1196
  "epoch": 18.58,
1197
- "grad_norm": 0.01165748666971922,
1198
  "learning_rate": 1.9555555555555557e-05,
1199
- "loss": 0.0003,
1200
  "step": 4125
1201
  },
1202
  {
1203
  "epoch": 18.69,
1204
- "grad_norm": 0.012751756235957146,
1205
  "learning_rate": 1.9e-05,
1206
  "loss": 0.0003,
1207
  "step": 4150
1208
  },
1209
  {
1210
  "epoch": 18.81,
1211
- "grad_norm": 0.01092427410185337,
1212
  "learning_rate": 1.8444444444444445e-05,
1213
- "loss": 0.0003,
1214
  "step": 4175
1215
  },
1216
  {
1217
  "epoch": 18.92,
1218
- "grad_norm": 0.010369419120252132,
1219
  "learning_rate": 1.788888888888889e-05,
1220
- "loss": 0.0007,
1221
  "step": 4200
1222
  },
1223
  {
1224
  "epoch": 19.03,
1225
- "grad_norm": 0.009451022371649742,
1226
  "learning_rate": 1.7333333333333336e-05,
1227
  "loss": 0.0004,
1228
  "step": 4225
1229
  },
1230
  {
1231
  "epoch": 19.14,
1232
- "grad_norm": 0.010264468379318714,
1233
  "learning_rate": 1.677777777777778e-05,
1234
  "loss": 0.0003,
1235
  "step": 4250
1236
  },
1237
  {
1238
  "epoch": 19.26,
1239
- "grad_norm": 0.009353878907859325,
1240
  "learning_rate": 1.6222222222222223e-05,
1241
  "loss": 0.0003,
1242
  "step": 4275
1243
  },
1244
  {
1245
  "epoch": 19.37,
1246
- "grad_norm": 0.007795905694365501,
1247
  "learning_rate": 1.5666666666666667e-05,
1248
  "loss": 0.0003,
1249
  "step": 4300
1250
  },
1251
  {
1252
  "epoch": 19.48,
1253
- "grad_norm": 0.009554468095302582,
1254
  "learning_rate": 1.5111111111111112e-05,
1255
- "loss": 0.0004,
1256
  "step": 4325
1257
  },
1258
  {
1259
  "epoch": 19.59,
1260
- "grad_norm": 0.009386077523231506,
1261
  "learning_rate": 1.4555555555555556e-05,
1262
  "loss": 0.0003,
1263
  "step": 4350
1264
  },
1265
  {
1266
  "epoch": 19.71,
1267
- "grad_norm": 0.007565716747194529,
1268
  "learning_rate": 1.4000000000000001e-05,
1269
- "loss": 0.0004,
1270
  "step": 4375
1271
  },
1272
  {
1273
  "epoch": 19.82,
1274
- "grad_norm": 0.011739292182028294,
1275
  "learning_rate": 1.3444444444444445e-05,
1276
  "loss": 0.0003,
1277
  "step": 4400
1278
  },
1279
  {
1280
  "epoch": 19.93,
1281
- "grad_norm": 0.011955379508435726,
1282
  "learning_rate": 1.2888888888888889e-05,
1283
- "loss": 0.0004,
1284
  "step": 4425
1285
  },
1286
  {
1287
  "epoch": 20.05,
1288
- "grad_norm": 0.007369581609964371,
1289
  "learning_rate": 1.2333333333333334e-05,
1290
  "loss": 0.0004,
1291
  "step": 4450
1292
  },
1293
  {
1294
  "epoch": 20.16,
1295
- "grad_norm": 0.010209435597062111,
1296
  "learning_rate": 1.1777777777777778e-05,
1297
  "loss": 0.0003,
1298
  "step": 4475
1299
  },
1300
  {
1301
  "epoch": 20.27,
1302
- "grad_norm": 0.009368482045829296,
1303
  "learning_rate": 1.1222222222222224e-05,
1304
  "loss": 0.0003,
1305
  "step": 4500
1306
  },
1307
  {
1308
  "epoch": 20.38,
1309
- "grad_norm": 0.008915912359952927,
1310
  "learning_rate": 1.0666666666666667e-05,
1311
  "loss": 0.0003,
1312
  "step": 4525
1313
  },
1314
  {
1315
  "epoch": 20.5,
1316
- "grad_norm": 0.01048735436052084,
1317
  "learning_rate": 1.0111111111111111e-05,
1318
  "loss": 0.0003,
1319
  "step": 4550
1320
  },
1321
  {
1322
  "epoch": 20.61,
1323
- "grad_norm": 0.010569226928055286,
1324
  "learning_rate": 9.555555555555556e-06,
1325
  "loss": 0.0003,
1326
  "step": 4575
1327
  },
1328
  {
1329
  "epoch": 20.72,
1330
- "grad_norm": 0.008401792496442795,
1331
  "learning_rate": 9e-06,
1332
- "loss": 0.0003,
1333
  "step": 4600
1334
  },
1335
  {
1336
  "epoch": 20.83,
1337
- "grad_norm": 0.01062182616442442,
1338
  "learning_rate": 8.444444444444446e-06,
1339
  "loss": 0.0003,
1340
  "step": 4625
1341
  },
1342
  {
1343
  "epoch": 20.95,
1344
- "grad_norm": 0.007442856673151255,
1345
  "learning_rate": 7.88888888888889e-06,
1346
- "loss": 0.0004,
1347
  "step": 4650
1348
  },
1349
  {
1350
  "epoch": 21.06,
1351
- "grad_norm": 0.007747430354356766,
1352
  "learning_rate": 7.333333333333334e-06,
1353
  "loss": 0.0003,
1354
  "step": 4675
1355
  },
1356
  {
1357
  "epoch": 21.17,
1358
- "grad_norm": 0.008953329175710678,
1359
  "learning_rate": 6.777777777777779e-06,
1360
  "loss": 0.0003,
1361
  "step": 4700
1362
  },
1363
  {
1364
  "epoch": 21.28,
1365
- "grad_norm": 0.0087329912930727,
1366
  "learning_rate": 6.222222222222222e-06,
1367
  "loss": 0.0003,
1368
  "step": 4725
1369
  },
1370
  {
1371
  "epoch": 21.4,
1372
- "grad_norm": 0.007937785238027573,
1373
  "learning_rate": 5.666666666666667e-06,
1374
  "loss": 0.0003,
1375
  "step": 4750
1376
  },
1377
  {
1378
  "epoch": 21.51,
1379
- "grad_norm": 0.007708992809057236,
1380
  "learning_rate": 5.1111111111111115e-06,
1381
  "loss": 0.0003,
1382
  "step": 4775
1383
  },
1384
  {
1385
  "epoch": 21.62,
1386
- "grad_norm": 0.011778591200709343,
1387
  "learning_rate": 4.555555555555556e-06,
1388
  "loss": 0.0003,
1389
  "step": 4800
1390
  },
1391
  {
1392
  "epoch": 21.73,
1393
- "grad_norm": 0.00828944519162178,
1394
  "learning_rate": 4.000000000000001e-06,
1395
- "loss": 0.0002,
1396
  "step": 4825
1397
  },
1398
  {
1399
  "epoch": 21.85,
1400
- "grad_norm": 0.007438404019922018,
1401
  "learning_rate": 3.4444444444444444e-06,
1402
  "loss": 0.0003,
1403
  "step": 4850
1404
  },
1405
  {
1406
  "epoch": 21.96,
1407
- "grad_norm": 0.007443991024047136,
1408
  "learning_rate": 2.888888888888889e-06,
1409
  "loss": 0.0003,
1410
  "step": 4875
1411
  },
1412
  {
1413
  "epoch": 22.07,
1414
- "grad_norm": 0.008769960142672062,
1415
  "learning_rate": 2.3333333333333336e-06,
1416
  "loss": 0.0003,
1417
  "step": 4900
1418
  },
1419
  {
1420
  "epoch": 22.18,
1421
- "grad_norm": 0.008519369177520275,
1422
  "learning_rate": 1.777777777777778e-06,
1423
  "loss": 0.0003,
1424
  "step": 4925
1425
  },
1426
  {
1427
  "epoch": 22.3,
1428
- "grad_norm": 0.007310151122510433,
1429
  "learning_rate": 1.2222222222222223e-06,
1430
  "loss": 0.0002,
1431
  "step": 4950
1432
  },
1433
  {
1434
  "epoch": 22.41,
1435
- "grad_norm": 0.0072664907202124596,
1436
  "learning_rate": 6.666666666666667e-07,
1437
  "loss": 0.0002,
1438
  "step": 4975
1439
  },
1440
  {
1441
  "epoch": 22.52,
1442
- "grad_norm": 0.00765978591516614,
1443
  "learning_rate": 1.1111111111111112e-07,
1444
  "loss": 0.0003,
1445
  "step": 5000
1446
  },
1447
  {
1448
  "epoch": 22.52,
1449
- "eval_loss": 0.3728739619255066,
1450
- "eval_runtime": 1237.2295,
1451
- "eval_samples_per_second": 2.524,
1452
- "eval_steps_per_second": 0.079,
1453
- "eval_wer": 0.2660897782585181,
1454
  "step": 5000
1455
  },
1456
  {
1457
  "epoch": 22.52,
1458
  "step": 5000,
1459
  "total_flos": 2.532745423355904e+20,
1460
- "train_loss": 0.17524469082718716,
1461
- "train_runtime": 15083.6622,
1462
- "train_samples_per_second": 10.608,
1463
- "train_steps_per_second": 0.331
1464
  }
1465
  ],
1466
  "logging_steps": 25,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.11,
13
+ "grad_norm": 29.43235206604004,
14
  "learning_rate": 4.000000000000001e-06,
15
  "loss": 11.9112,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.23,
20
+ "grad_norm": 12.577169418334961,
21
  "learning_rate": 9e-06,
22
+ "loss": 5.9608,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.34,
27
+ "grad_norm": 6.2483415603637695,
28
  "learning_rate": 1.4000000000000001e-05,
29
  "loss": 2.7899,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.45,
34
+ "grad_norm": 5.498162746429443,
35
  "learning_rate": 1.9e-05,
36
+ "loss": 1.9344,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.56,
41
+ "grad_norm": 10.876449584960938,
42
  "learning_rate": 2.4e-05,
43
+ "loss": 1.1848,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.68,
48
+ "grad_norm": 6.835580825805664,
49
  "learning_rate": 2.9e-05,
50
+ "loss": 0.7884,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.79,
55
+ "grad_norm": 8.131637573242188,
56
  "learning_rate": 3.4000000000000007e-05,
57
+ "loss": 0.6148,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.9,
62
+ "grad_norm": 3.9987149238586426,
63
  "learning_rate": 3.9000000000000006e-05,
64
+ "loss": 0.5234,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 1.01,
69
+ "grad_norm": 3.6429152488708496,
70
  "learning_rate": 4.4000000000000006e-05,
71
  "loss": 0.453,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 1.13,
76
+ "grad_norm": 4.594738483428955,
77
  "learning_rate": 4.9e-05,
78
  "loss": 0.3913,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 1.24,
83
+ "grad_norm": 4.02340841293335,
84
  "learning_rate": 5.4000000000000005e-05,
85
  "loss": 0.3729,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 1.35,
90
+ "grad_norm": 4.27403450012207,
91
  "learning_rate": 5.9e-05,
92
  "loss": 0.3544,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 1.46,
97
+ "grad_norm": 3.860103130340576,
98
  "learning_rate": 6.400000000000001e-05,
99
+ "loss": 0.3225,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 1.58,
104
+ "grad_norm": 2.948971748352051,
105
  "learning_rate": 6.9e-05,
106
+ "loss": 0.3358,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 1.69,
111
+ "grad_norm": 2.851400852203369,
112
  "learning_rate": 7.4e-05,
113
+ "loss": 0.3143,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 1.8,
118
+ "grad_norm": 3.7811708450317383,
119
  "learning_rate": 7.900000000000001e-05,
120
+ "loss": 0.2908,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 1.91,
125
+ "grad_norm": 2.640065908432007,
126
  "learning_rate": 8.4e-05,
127
+ "loss": 0.3066,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 2.03,
132
+ "grad_norm": 2.669788122177124,
133
  "learning_rate": 8.900000000000001e-05,
134
+ "loss": 0.2659,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 2.14,
139
+ "grad_norm": 3.3060202598571777,
140
  "learning_rate": 9.4e-05,
141
+ "loss": 0.2312,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 2.25,
146
+ "grad_norm": 2.385368824005127,
147
  "learning_rate": 9.900000000000001e-05,
148
+ "loss": 0.2275,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 2.36,
153
+ "grad_norm": 2.557762861251831,
154
  "learning_rate": 9.955555555555556e-05,
155
+ "loss": 0.2424,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 2.48,
160
+ "grad_norm": 2.552363872528076,
161
  "learning_rate": 9.900000000000001e-05,
162
+ "loss": 0.2194,
163
  "step": 550
164
  },
165
  {
166
  "epoch": 2.59,
167
+ "grad_norm": 2.041868209838867,
168
  "learning_rate": 9.844444444444444e-05,
169
+ "loss": 0.235,
170
  "step": 575
171
  },
172
  {
173
  "epoch": 2.7,
174
+ "grad_norm": 2.3052070140838623,
175
  "learning_rate": 9.78888888888889e-05,
176
+ "loss": 0.227,
177
  "step": 600
178
  },
179
  {
180
  "epoch": 2.82,
181
+ "grad_norm": 2.061685562133789,
182
  "learning_rate": 9.733333333333335e-05,
183
+ "loss": 0.2054,
184
  "step": 625
185
  },
186
  {
187
  "epoch": 2.93,
188
+ "grad_norm": 2.4055511951446533,
189
  "learning_rate": 9.677777777777778e-05,
190
+ "loss": 0.218,
191
  "step": 650
192
  },
193
  {
194
  "epoch": 3.04,
195
+ "grad_norm": 1.5952019691467285,
196
  "learning_rate": 9.622222222222222e-05,
197
+ "loss": 0.1879,
198
  "step": 675
199
  },
200
  {
201
  "epoch": 3.15,
202
+ "grad_norm": 1.5567409992218018,
203
  "learning_rate": 9.566666666666667e-05,
204
+ "loss": 0.1524,
205
  "step": 700
206
  },
207
  {
208
  "epoch": 3.27,
209
+ "grad_norm": 2.305778741836548,
210
  "learning_rate": 9.511111111111112e-05,
211
+ "loss": 0.1553,
212
  "step": 725
213
  },
214
  {
215
  "epoch": 3.38,
216
+ "grad_norm": 1.9750478267669678,
217
  "learning_rate": 9.455555555555556e-05,
218
+ "loss": 0.1538,
219
  "step": 750
220
  },
221
  {
222
  "epoch": 3.49,
223
+ "grad_norm": 2.064730405807495,
224
  "learning_rate": 9.4e-05,
225
+ "loss": 0.158,
226
  "step": 775
227
  },
228
  {
229
  "epoch": 3.6,
230
+ "grad_norm": 2.288181781768799,
231
  "learning_rate": 9.344444444444444e-05,
232
+ "loss": 0.1488,
233
  "step": 800
234
  },
235
  {
236
  "epoch": 3.72,
237
+ "grad_norm": 1.7010929584503174,
238
  "learning_rate": 9.28888888888889e-05,
239
+ "loss": 0.1491,
240
  "step": 825
241
  },
242
  {
243
  "epoch": 3.83,
244
+ "grad_norm": 1.653943657875061,
245
  "learning_rate": 9.233333333333333e-05,
246
+ "loss": 0.149,
247
  "step": 850
248
  },
249
  {
250
  "epoch": 3.94,
251
+ "grad_norm": 1.655405044555664,
252
  "learning_rate": 9.177777777777778e-05,
253
+ "loss": 0.1505,
254
  "step": 875
255
  },
256
  {
257
  "epoch": 4.05,
258
+ "grad_norm": 1.766348958015442,
259
  "learning_rate": 9.122222222222223e-05,
260
+ "loss": 0.122,
261
  "step": 900
262
  },
263
  {
264
  "epoch": 4.17,
265
+ "grad_norm": 1.1792188882827759,
266
  "learning_rate": 9.066666666666667e-05,
267
+ "loss": 0.0985,
268
  "step": 925
269
  },
270
  {
271
  "epoch": 4.28,
272
+ "grad_norm": 1.3778753280639648,
273
  "learning_rate": 9.011111111111111e-05,
274
+ "loss": 0.0994,
275
  "step": 950
276
  },
277
  {
278
  "epoch": 4.39,
279
+ "grad_norm": 1.5239909887313843,
280
  "learning_rate": 8.955555555555556e-05,
281
+ "loss": 0.1067,
282
  "step": 975
283
  },
284
  {
285
  "epoch": 4.5,
286
+ "grad_norm": 1.2479132413864136,
287
  "learning_rate": 8.900000000000001e-05,
288
+ "loss": 0.1035,
289
  "step": 1000
290
  },
291
  {
292
  "epoch": 4.5,
293
+ "eval_loss": 0.30151915550231934,
294
+ "eval_runtime": 1360.3971,
295
+ "eval_samples_per_second": 2.296,
296
+ "eval_steps_per_second": 0.072,
297
+ "eval_wer": 0.3249633006258209,
298
  "step": 1000
299
  },
300
  {
301
  "epoch": 4.62,
302
+ "grad_norm": 1.8175023794174194,
303
  "learning_rate": 8.844444444444445e-05,
304
+ "loss": 0.1114,
305
  "step": 1025
306
  },
307
  {
308
  "epoch": 4.73,
309
+ "grad_norm": 1.7170511484146118,
310
  "learning_rate": 8.78888888888889e-05,
311
+ "loss": 0.1059,
312
  "step": 1050
313
  },
314
  {
315
  "epoch": 4.84,
316
+ "grad_norm": 1.6984485387802124,
317
  "learning_rate": 8.733333333333333e-05,
318
+ "loss": 0.1057,
319
  "step": 1075
320
  },
321
  {
322
  "epoch": 4.95,
323
+ "grad_norm": 2.2961227893829346,
324
  "learning_rate": 8.677777777777778e-05,
325
+ "loss": 0.1008,
326
  "step": 1100
327
  },
328
  {
329
  "epoch": 5.07,
330
+ "grad_norm": 1.120532751083374,
331
  "learning_rate": 8.622222222222222e-05,
332
+ "loss": 0.0785,
333
  "step": 1125
334
  },
335
  {
336
  "epoch": 5.18,
337
+ "grad_norm": 2.4177873134613037,
338
  "learning_rate": 8.566666666666667e-05,
339
+ "loss": 0.0587,
340
  "step": 1150
341
  },
342
  {
343
  "epoch": 5.29,
344
+ "grad_norm": 1.216162085533142,
345
  "learning_rate": 8.511111111111112e-05,
346
+ "loss": 0.0645,
347
  "step": 1175
348
  },
349
  {
350
  "epoch": 5.41,
351
+ "grad_norm": 1.3237918615341187,
352
  "learning_rate": 8.455555555555556e-05,
353
+ "loss": 0.0667,
354
  "step": 1200
355
  },
356
  {
357
  "epoch": 5.52,
358
+ "grad_norm": 1.7567414045333862,
359
  "learning_rate": 8.4e-05,
360
+ "loss": 0.0681,
361
  "step": 1225
362
  },
363
  {
364
  "epoch": 5.63,
365
+ "grad_norm": 1.6587105989456177,
366
  "learning_rate": 8.344444444444445e-05,
367
+ "loss": 0.0687,
368
  "step": 1250
369
  },
370
  {
371
  "epoch": 5.74,
372
+ "grad_norm": 1.6716225147247314,
373
  "learning_rate": 8.28888888888889e-05,
374
+ "loss": 0.0697,
375
  "step": 1275
376
  },
377
  {
378
  "epoch": 5.86,
379
+ "grad_norm": 1.4438738822937012,
380
  "learning_rate": 8.233333333333333e-05,
381
+ "loss": 0.0714,
382
  "step": 1300
383
  },
384
  {
385
  "epoch": 5.97,
386
+ "grad_norm": 1.4982130527496338,
387
  "learning_rate": 8.177777777777778e-05,
388
+ "loss": 0.0713,
389
  "step": 1325
390
  },
391
  {
392
  "epoch": 6.08,
393
+ "grad_norm": 1.739702820777893,
394
  "learning_rate": 8.122222222222222e-05,
395
+ "loss": 0.0457,
396
  "step": 1350
397
  },
398
  {
399
  "epoch": 6.19,
400
+ "grad_norm": 0.8361156582832336,
401
  "learning_rate": 8.066666666666667e-05,
402
+ "loss": 0.0379,
403
  "step": 1375
404
  },
405
  {
406
  "epoch": 6.31,
407
+ "grad_norm": 0.9192413091659546,
408
  "learning_rate": 8.011111111111111e-05,
409
+ "loss": 0.0437,
410
  "step": 1400
411
  },
412
  {
413
  "epoch": 6.42,
414
+ "grad_norm": 0.7922126054763794,
415
  "learning_rate": 7.955555555555556e-05,
416
+ "loss": 0.0393,
417
  "step": 1425
418
  },
419
  {
420
  "epoch": 6.53,
421
+ "grad_norm": 0.7307619452476501,
422
  "learning_rate": 7.900000000000001e-05,
423
+ "loss": 0.0398,
424
  "step": 1450
425
  },
426
  {
427
  "epoch": 6.64,
428
+ "grad_norm": 0.776818037033081,
429
  "learning_rate": 7.844444444444446e-05,
430
+ "loss": 0.0417,
431
  "step": 1475
432
  },
433
  {
434
  "epoch": 6.76,
435
+ "grad_norm": 1.0701009035110474,
436
  "learning_rate": 7.788888888888888e-05,
437
+ "loss": 0.044,
438
  "step": 1500
439
  },
440
  {
441
  "epoch": 6.87,
442
+ "grad_norm": 0.9106244444847107,
443
  "learning_rate": 7.733333333333333e-05,
444
+ "loss": 0.0449,
445
  "step": 1525
446
  },
447
  {
448
  "epoch": 6.98,
449
+ "grad_norm": 0.8038358688354492,
450
  "learning_rate": 7.677777777777778e-05,
451
+ "loss": 0.0443,
452
  "step": 1550
453
  },
454
  {
455
  "epoch": 7.09,
456
+ "grad_norm": 0.8888857364654541,
457
  "learning_rate": 7.622222222222223e-05,
458
+ "loss": 0.0295,
459
  "step": 1575
460
  },
461
  {
462
  "epoch": 7.21,
463
+ "grad_norm": 0.7159153819084167,
464
  "learning_rate": 7.566666666666667e-05,
465
+ "loss": 0.0235,
466
  "step": 1600
467
  },
468
  {
469
  "epoch": 7.32,
470
+ "grad_norm": 0.9652548432350159,
471
  "learning_rate": 7.511111111111111e-05,
472
+ "loss": 0.0276,
473
  "step": 1625
474
  },
475
  {
476
  "epoch": 7.43,
477
+ "grad_norm": 0.9230145215988159,
478
  "learning_rate": 7.455555555555556e-05,
479
+ "loss": 0.0253,
480
  "step": 1650
481
  },
482
  {
483
  "epoch": 7.55,
484
+ "grad_norm": 1.6814730167388916,
485
  "learning_rate": 7.4e-05,
486
+ "loss": 0.029,
487
  "step": 1675
488
  },
489
  {
490
  "epoch": 7.66,
491
+ "grad_norm": 1.036941409111023,
492
  "learning_rate": 7.344444444444445e-05,
493
+ "loss": 0.0262,
494
  "step": 1700
495
  },
496
  {
497
  "epoch": 7.77,
498
+ "grad_norm": 0.6998699307441711,
499
  "learning_rate": 7.28888888888889e-05,
500
+ "loss": 0.027,
501
  "step": 1725
502
  },
503
  {
504
  "epoch": 7.88,
505
+ "grad_norm": 0.7357670068740845,
506
  "learning_rate": 7.233333333333335e-05,
507
+ "loss": 0.0254,
508
  "step": 1750
509
  },
510
  {
511
  "epoch": 8.0,
512
+ "grad_norm": 2.0645382404327393,
513
  "learning_rate": 7.177777777777777e-05,
514
+ "loss": 0.0299,
515
  "step": 1775
516
  },
517
  {
518
  "epoch": 8.11,
519
+ "grad_norm": 0.7657965421676636,
520
  "learning_rate": 7.122222222222222e-05,
521
+ "loss": 0.0169,
522
  "step": 1800
523
  },
524
  {
525
  "epoch": 8.22,
526
+ "grad_norm": 1.1174191236495972,
527
  "learning_rate": 7.066666666666667e-05,
528
+ "loss": 0.0142,
529
  "step": 1825
530
  },
531
  {
532
  "epoch": 8.33,
533
+ "grad_norm": 1.5552918910980225,
534
  "learning_rate": 7.011111111111112e-05,
535
+ "loss": 0.0169,
536
  "step": 1850
537
  },
538
  {
539
  "epoch": 8.45,
540
+ "grad_norm": 0.6741360425949097,
541
  "learning_rate": 6.955555555555556e-05,
542
+ "loss": 0.0171,
543
  "step": 1875
544
  },
545
  {
546
  "epoch": 8.56,
547
+ "grad_norm": 0.6866123080253601,
548
  "learning_rate": 6.9e-05,
549
+ "loss": 0.017,
550
  "step": 1900
551
  },
552
  {
553
  "epoch": 8.67,
554
+ "grad_norm": 0.5000187754631042,
555
  "learning_rate": 6.844444444444445e-05,
556
+ "loss": 0.0158,
557
  "step": 1925
558
  },
559
  {
560
  "epoch": 8.78,
561
+ "grad_norm": 0.7297791838645935,
562
  "learning_rate": 6.788888888888888e-05,
563
+ "loss": 0.0175,
564
  "step": 1950
565
  },
566
  {
567
  "epoch": 8.9,
568
+ "grad_norm": 0.7372362017631531,
569
  "learning_rate": 6.733333333333333e-05,
570
+ "loss": 0.0156,
571
  "step": 1975
572
  },
573
  {
574
  "epoch": 9.01,
575
+ "grad_norm": 0.7431686520576477,
576
  "learning_rate": 6.677777777777779e-05,
577
+ "loss": 0.0165,
578
  "step": 2000
579
  },
580
  {
581
  "epoch": 9.01,
582
+ "eval_loss": 0.3495735228061676,
583
+ "eval_runtime": 1282.4445,
584
+ "eval_samples_per_second": 2.435,
585
+ "eval_steps_per_second": 0.076,
586
+ "eval_wer": 0.30066445182724255,
587
  "step": 2000
588
  },
589
  {
590
  "epoch": 9.12,
591
+ "grad_norm": 0.7414669990539551,
592
  "learning_rate": 6.622222222222224e-05,
593
  "loss": 0.0124,
594
  "step": 2025
595
  },
596
  {
597
  "epoch": 9.23,
598
+ "grad_norm": 0.7123928666114807,
599
  "learning_rate": 6.566666666666666e-05,
600
+ "loss": 0.0111,
601
  "step": 2050
602
  },
603
  {
604
  "epoch": 9.35,
605
+ "grad_norm": 0.5316759943962097,
606
  "learning_rate": 6.511111111111111e-05,
607
+ "loss": 0.0111,
608
  "step": 2075
609
  },
610
  {
611
  "epoch": 9.46,
612
+ "grad_norm": 0.2563645839691162,
613
  "learning_rate": 6.455555555555556e-05,
614
+ "loss": 0.012,
615
  "step": 2100
616
  },
617
  {
618
  "epoch": 9.57,
619
+ "grad_norm": 1.2440812587738037,
620
  "learning_rate": 6.400000000000001e-05,
621
+ "loss": 0.0106,
622
  "step": 2125
623
  },
624
  {
625
  "epoch": 9.68,
626
+ "grad_norm": 1.0652848482131958,
627
  "learning_rate": 6.344444444444445e-05,
628
+ "loss": 0.0082,
629
  "step": 2150
630
  },
631
  {
632
  "epoch": 9.8,
633
+ "grad_norm": 1.4494587182998657,
634
  "learning_rate": 6.28888888888889e-05,
635
+ "loss": 0.0089,
636
  "step": 2175
637
  },
638
  {
639
  "epoch": 9.91,
640
+ "grad_norm": 1.4650260210037231,
641
  "learning_rate": 6.233333333333334e-05,
642
+ "loss": 0.0087,
643
  "step": 2200
644
  },
645
  {
646
  "epoch": 10.02,
647
+ "grad_norm": 0.8734236359596252,
648
  "learning_rate": 6.177777777777779e-05,
649
+ "loss": 0.0081,
650
  "step": 2225
651
  },
652
  {
653
  "epoch": 10.14,
654
+ "grad_norm": 0.3450920283794403,
655
  "learning_rate": 6.122222222222222e-05,
656
+ "loss": 0.0083,
657
  "step": 2250
658
  },
659
  {
660
  "epoch": 10.25,
661
+ "grad_norm": 0.21007080376148224,
662
  "learning_rate": 6.066666666666667e-05,
663
+ "loss": 0.0084,
664
  "step": 2275
665
  },
666
  {
667
  "epoch": 10.36,
668
+ "grad_norm": 0.646001935005188,
669
  "learning_rate": 6.011111111111112e-05,
670
+ "loss": 0.0093,
671
  "step": 2300
672
  },
673
  {
674
  "epoch": 10.47,
675
+ "grad_norm": 0.2686345875263214,
676
  "learning_rate": 5.9555555555555554e-05,
677
+ "loss": 0.009,
678
  "step": 2325
679
  },
680
  {
681
  "epoch": 10.59,
682
+ "grad_norm": 0.4910290241241455,
683
  "learning_rate": 5.9e-05,
684
+ "loss": 0.0076,
685
  "step": 2350
686
  },
687
  {
688
  "epoch": 10.7,
689
+ "grad_norm": 0.2715873718261719,
690
  "learning_rate": 5.844444444444445e-05,
691
+ "loss": 0.0068,
692
  "step": 2375
693
  },
694
  {
695
  "epoch": 10.81,
696
+ "grad_norm": 0.4012056589126587,
697
  "learning_rate": 5.788888888888889e-05,
698
+ "loss": 0.009,
699
  "step": 2400
700
  },
701
  {
702
  "epoch": 10.92,
703
+ "grad_norm": 0.7254907488822937,
704
  "learning_rate": 5.7333333333333336e-05,
705
+ "loss": 0.0083,
706
  "step": 2425
707
  },
708
  {
709
  "epoch": 11.04,
710
+ "grad_norm": 0.8048310279846191,
711
  "learning_rate": 5.6777777777777786e-05,
712
+ "loss": 0.007,
713
  "step": 2450
714
  },
715
  {
716
  "epoch": 11.15,
717
+ "grad_norm": 0.4055149555206299,
718
  "learning_rate": 5.622222222222222e-05,
719
+ "loss": 0.0063,
720
  "step": 2475
721
  },
722
  {
723
  "epoch": 11.26,
724
+ "grad_norm": 0.4453238248825073,
725
  "learning_rate": 5.566666666666667e-05,
726
+ "loss": 0.0061,
727
  "step": 2500
728
  },
729
  {
730
  "epoch": 11.37,
731
+ "grad_norm": 0.7719666361808777,
732
  "learning_rate": 5.511111111111111e-05,
733
+ "loss": 0.006,
734
  "step": 2525
735
  },
736
  {
737
  "epoch": 11.49,
738
+ "grad_norm": 0.6845406889915466,
739
  "learning_rate": 5.455555555555556e-05,
740
+ "loss": 0.006,
741
  "step": 2550
742
  },
743
  {
744
  "epoch": 11.6,
745
+ "grad_norm": 1.624002456665039,
746
  "learning_rate": 5.4000000000000005e-05,
747
+ "loss": 0.006,
748
  "step": 2575
749
  },
750
  {
751
  "epoch": 11.71,
752
+ "grad_norm": 0.28533199429512024,
753
  "learning_rate": 5.3444444444444455e-05,
754
+ "loss": 0.006,
755
  "step": 2600
756
  },
757
  {
758
  "epoch": 11.82,
759
+ "grad_norm": 0.6341890096664429,
760
  "learning_rate": 5.2888888888888885e-05,
761
+ "loss": 0.0058,
762
  "step": 2625
763
  },
764
  {
765
  "epoch": 11.94,
766
+ "grad_norm": 0.8105676770210266,
767
  "learning_rate": 5.2333333333333336e-05,
768
+ "loss": 0.0048,
769
  "step": 2650
770
  },
771
  {
772
  "epoch": 12.05,
773
+ "grad_norm": 0.4783516824245453,
774
  "learning_rate": 5.177777777777778e-05,
775
+ "loss": 0.006,
776
  "step": 2675
777
  },
778
  {
779
  "epoch": 12.16,
780
+ "grad_norm": 0.8321937322616577,
781
  "learning_rate": 5.122222222222223e-05,
782
+ "loss": 0.0042,
783
  "step": 2700
784
  },
785
  {
786
  "epoch": 12.27,
787
+ "grad_norm": 0.1419239044189453,
788
  "learning_rate": 5.0666666666666674e-05,
789
+ "loss": 0.0045,
790
  "step": 2725
791
  },
792
  {
793
  "epoch": 12.39,
794
+ "grad_norm": 0.19317950308322906,
795
  "learning_rate": 5.011111111111111e-05,
796
+ "loss": 0.0036,
797
  "step": 2750
798
  },
799
  {
800
  "epoch": 12.5,
801
+ "grad_norm": 0.13593174517154694,
802
  "learning_rate": 4.955555555555556e-05,
803
+ "loss": 0.004,
804
  "step": 2775
805
  },
806
  {
807
  "epoch": 12.61,
808
+ "grad_norm": 0.3188939094543457,
809
  "learning_rate": 4.9e-05,
810
+ "loss": 0.0034,
811
  "step": 2800
812
  },
813
  {
814
  "epoch": 12.73,
815
+ "grad_norm": 0.31336820125579834,
816
  "learning_rate": 4.844444444444445e-05,
817
+ "loss": 0.0035,
818
  "step": 2825
819
  },
820
  {
821
  "epoch": 12.84,
822
+ "grad_norm": 0.25262606143951416,
823
  "learning_rate": 4.7888888888888886e-05,
824
+ "loss": 0.0034,
825
  "step": 2850
826
  },
827
  {
828
  "epoch": 12.95,
829
+ "grad_norm": 0.4232746362686157,
830
  "learning_rate": 4.7333333333333336e-05,
831
+ "loss": 0.0036,
832
  "step": 2875
833
  },
834
  {
835
  "epoch": 13.06,
836
+ "grad_norm": 0.37615010142326355,
837
  "learning_rate": 4.677777777777778e-05,
838
+ "loss": 0.0034,
839
  "step": 2900
840
  },
841
  {
842
  "epoch": 13.18,
843
+ "grad_norm": 0.13130201399326324,
844
  "learning_rate": 4.6222222222222224e-05,
845
+ "loss": 0.0024,
846
  "step": 2925
847
  },
848
  {
849
  "epoch": 13.29,
850
+ "grad_norm": 0.31216856837272644,
851
  "learning_rate": 4.566666666666667e-05,
852
+ "loss": 0.0034,
853
  "step": 2950
854
  },
855
  {
856
  "epoch": 13.4,
857
+ "grad_norm": 0.06603355705738068,
858
  "learning_rate": 4.511111111111112e-05,
859
+ "loss": 0.0019,
860
  "step": 2975
861
  },
862
  {
863
  "epoch": 13.51,
864
+ "grad_norm": 0.2945907711982727,
865
  "learning_rate": 4.4555555555555555e-05,
866
+ "loss": 0.0022,
867
  "step": 3000
868
  },
869
  {
870
  "epoch": 13.51,
871
+ "eval_loss": 0.36493760347366333,
872
+ "eval_runtime": 1357.8705,
873
+ "eval_samples_per_second": 2.3,
874
+ "eval_steps_per_second": 0.072,
875
+ "eval_wer": 0.2786061963995982,
876
  "step": 3000
877
  },
878
  {
879
  "epoch": 13.63,
880
+ "grad_norm": 0.2547701597213745,
881
  "learning_rate": 4.4000000000000006e-05,
882
+ "loss": 0.002,
883
  "step": 3025
884
  },
885
  {
886
  "epoch": 13.74,
887
+ "grad_norm": 0.7029439806938171,
888
  "learning_rate": 4.344444444444445e-05,
889
+ "loss": 0.002,
890
  "step": 3050
891
  },
892
  {
893
  "epoch": 13.85,
894
+ "grad_norm": 0.821772038936615,
895
  "learning_rate": 4.2888888888888886e-05,
896
+ "loss": 0.0025,
897
  "step": 3075
898
  },
899
  {
900
  "epoch": 13.96,
901
+ "grad_norm": 0.17252065241336823,
902
  "learning_rate": 4.233333333333334e-05,
903
+ "loss": 0.0027,
904
  "step": 3100
905
  },
906
  {
907
  "epoch": 14.08,
908
+ "grad_norm": 0.18165266513824463,
909
  "learning_rate": 4.177777777777778e-05,
910
+ "loss": 0.0022,
911
  "step": 3125
912
  },
913
  {
914
  "epoch": 14.19,
915
+ "grad_norm": 0.11963178962469101,
916
  "learning_rate": 4.1222222222222224e-05,
917
+ "loss": 0.0023,
918
  "step": 3150
919
  },
920
  {
921
  "epoch": 14.3,
922
+ "grad_norm": 1.3975796699523926,
923
  "learning_rate": 4.066666666666667e-05,
924
+ "loss": 0.0024,
925
  "step": 3175
926
  },
927
  {
928
  "epoch": 14.41,
929
+ "grad_norm": 0.3654703199863434,
930
  "learning_rate": 4.011111111111111e-05,
931
+ "loss": 0.0026,
932
  "step": 3200
933
  },
934
  {
935
  "epoch": 14.53,
936
+ "grad_norm": 0.22537653148174286,
937
  "learning_rate": 3.9555555555555556e-05,
938
+ "loss": 0.003,
939
  "step": 3225
940
  },
941
  {
942
  "epoch": 14.64,
943
+ "grad_norm": 0.531537652015686,
944
  "learning_rate": 3.9000000000000006e-05,
945
+ "loss": 0.0025,
946
  "step": 3250
947
  },
948
  {
949
  "epoch": 14.75,
950
+ "grad_norm": 0.09146568179130554,
951
  "learning_rate": 3.844444444444444e-05,
952
+ "loss": 0.0023,
953
  "step": 3275
954
  },
955
  {
956
  "epoch": 14.86,
957
+ "grad_norm": 0.3482789993286133,
958
  "learning_rate": 3.7888888888888894e-05,
959
+ "loss": 0.0028,
960
  "step": 3300
961
  },
962
  {
963
  "epoch": 14.98,
964
+ "grad_norm": 0.15211211144924164,
965
  "learning_rate": 3.733333333333334e-05,
966
+ "loss": 0.0032,
967
  "step": 3325
968
  },
969
  {
970
  "epoch": 15.09,
971
+ "grad_norm": 0.02796722762286663,
972
  "learning_rate": 3.677777777777778e-05,
973
+ "loss": 0.002,
974
  "step": 3350
975
  },
976
  {
977
  "epoch": 15.2,
978
+ "grad_norm": 0.022246429696679115,
979
  "learning_rate": 3.6222222222222225e-05,
980
+ "loss": 0.0019,
981
  "step": 3375
982
  },
983
  {
984
  "epoch": 15.32,
985
+ "grad_norm": 0.2861407995223999,
986
  "learning_rate": 3.566666666666667e-05,
987
+ "loss": 0.0029,
988
  "step": 3400
989
  },
990
  {
991
  "epoch": 15.43,
992
+ "grad_norm": 0.24615710973739624,
993
  "learning_rate": 3.511111111111111e-05,
994
+ "loss": 0.003,
995
  "step": 3425
996
  },
997
  {
998
  "epoch": 15.54,
999
+ "grad_norm": 0.20990662276744843,
1000
  "learning_rate": 3.4555555555555556e-05,
1001
+ "loss": 0.003,
1002
  "step": 3450
1003
  },
1004
  {
1005
  "epoch": 15.65,
1006
+ "grad_norm": 0.21401448547840118,
1007
  "learning_rate": 3.4000000000000007e-05,
1008
+ "loss": 0.0017,
1009
  "step": 3475
1010
  },
1011
  {
1012
  "epoch": 15.77,
1013
+ "grad_norm": 0.04294591024518013,
1014
  "learning_rate": 3.3444444444444443e-05,
1015
+ "loss": 0.0019,
1016
  "step": 3500
1017
  },
1018
  {
1019
  "epoch": 15.88,
1020
+ "grad_norm": 0.7296904921531677,
1021
  "learning_rate": 3.2888888888888894e-05,
1022
+ "loss": 0.002,
1023
  "step": 3525
1024
  },
1025
  {
1026
  "epoch": 15.99,
1027
+ "grad_norm": 0.23458455502986908,
1028
  "learning_rate": 3.233333333333333e-05,
1029
+ "loss": 0.0016,
1030
  "step": 3550
1031
  },
1032
  {
1033
  "epoch": 16.1,
1034
+ "grad_norm": 0.02373860962688923,
1035
  "learning_rate": 3.177777777777778e-05,
1036
+ "loss": 0.0007,
1037
  "step": 3575
1038
  },
1039
  {
1040
  "epoch": 16.22,
1041
+ "grad_norm": 0.028949666768312454,
1042
  "learning_rate": 3.1222222222222225e-05,
1043
+ "loss": 0.0017,
1044
  "step": 3600
1045
  },
1046
  {
1047
  "epoch": 16.33,
1048
+ "grad_norm": 0.01981484517455101,
1049
  "learning_rate": 3.066666666666667e-05,
1050
+ "loss": 0.0013,
1051
  "step": 3625
1052
  },
1053
  {
1054
  "epoch": 16.44,
1055
+ "grad_norm": 0.017177563160657883,
1056
  "learning_rate": 3.0111111111111113e-05,
1057
+ "loss": 0.0014,
1058
  "step": 3650
1059
  },
1060
  {
1061
  "epoch": 16.55,
1062
+ "grad_norm": 0.021722471341490746,
1063
  "learning_rate": 2.955555555555556e-05,
1064
+ "loss": 0.0008,
1065
  "step": 3675
1066
  },
1067
  {
1068
  "epoch": 16.67,
1069
+ "grad_norm": 0.10685452073812485,
1070
  "learning_rate": 2.9e-05,
1071
+ "loss": 0.0013,
1072
  "step": 3700
1073
  },
1074
  {
1075
  "epoch": 16.78,
1076
+ "grad_norm": 0.01891660876572132,
1077
  "learning_rate": 2.8444444444444447e-05,
1078
  "loss": 0.0007,
1079
  "step": 3725
1080
  },
1081
  {
1082
  "epoch": 16.89,
1083
+ "grad_norm": 0.021835455670952797,
1084
  "learning_rate": 2.788888888888889e-05,
1085
+ "loss": 0.0011,
1086
  "step": 3750
1087
  },
1088
  {
1089
  "epoch": 17.0,
1090
+ "grad_norm": 0.017706584185361862,
1091
  "learning_rate": 2.733333333333333e-05,
1092
+ "loss": 0.0006,
1093
  "step": 3775
1094
  },
1095
  {
1096
  "epoch": 17.12,
1097
+ "grad_norm": 0.2573525011539459,
1098
  "learning_rate": 2.677777777777778e-05,
1099
+ "loss": 0.0013,
1100
  "step": 3800
1101
  },
1102
  {
1103
  "epoch": 17.23,
1104
+ "grad_norm": 0.015161894261837006,
1105
  "learning_rate": 2.6222222222222226e-05,
1106
+ "loss": 0.0004,
1107
  "step": 3825
1108
  },
1109
  {
1110
  "epoch": 17.34,
1111
+ "grad_norm": 0.017167283222079277,
1112
  "learning_rate": 2.5666666666666666e-05,
1113
+ "loss": 0.0005,
1114
  "step": 3850
1115
  },
1116
  {
1117
  "epoch": 17.45,
1118
+ "grad_norm": 0.019201019778847694,
1119
  "learning_rate": 2.5111111111111113e-05,
1120
+ "loss": 0.0005,
1121
  "step": 3875
1122
  },
1123
  {
1124
  "epoch": 17.57,
1125
+ "grad_norm": 0.024687746539711952,
1126
  "learning_rate": 2.4555555555555557e-05,
1127
+ "loss": 0.0006,
1128
  "step": 3900
1129
  },
1130
  {
1131
  "epoch": 17.68,
1132
+ "grad_norm": 0.016668912023305893,
1133
  "learning_rate": 2.4e-05,
1134
+ "loss": 0.0006,
1135
  "step": 3925
1136
  },
1137
  {
1138
  "epoch": 17.79,
1139
+ "grad_norm": 0.013516202569007874,
1140
  "learning_rate": 2.3444444444444448e-05,
1141
+ "loss": 0.0006,
1142
  "step": 3950
1143
  },
1144
  {
1145
  "epoch": 17.91,
1146
+ "grad_norm": 0.012521643191576004,
1147
  "learning_rate": 2.288888888888889e-05,
1148
+ "loss": 0.0006,
1149
  "step": 3975
1150
  },
1151
  {
1152
  "epoch": 18.02,
1153
+ "grad_norm": 0.013049867004156113,
1154
  "learning_rate": 2.2333333333333335e-05,
1155
+ "loss": 0.0011,
1156
  "step": 4000
1157
  },
1158
  {
1159
  "epoch": 18.02,
1160
+ "eval_loss": 0.3699657618999481,
1161
+ "eval_runtime": 1350.6575,
1162
+ "eval_samples_per_second": 2.312,
1163
+ "eval_steps_per_second": 0.073,
1164
+ "eval_wer": 0.2680985861083211,
1165
  "step": 4000
1166
  },
1167
  {
1168
  "epoch": 18.13,
1169
+ "grad_norm": 0.0116911381483078,
1170
  "learning_rate": 2.177777777777778e-05,
1171
+ "loss": 0.0005,
1172
  "step": 4025
1173
  },
1174
  {
1175
  "epoch": 18.24,
1176
+ "grad_norm": 0.011522598564624786,
1177
  "learning_rate": 2.1222222222222223e-05,
1178
+ "loss": 0.0007,
1179
  "step": 4050
1180
  },
1181
  {
1182
  "epoch": 18.36,
1183
+ "grad_norm": 0.013516987673938274,
1184
  "learning_rate": 2.0666666666666666e-05,
1185
+ "loss": 0.0004,
1186
  "step": 4075
1187
  },
1188
  {
1189
  "epoch": 18.47,
1190
+ "grad_norm": 0.021075060591101646,
1191
  "learning_rate": 2.011111111111111e-05,
1192
+ "loss": 0.0006,
1193
  "step": 4100
1194
  },
1195
  {
1196
  "epoch": 18.58,
1197
+ "grad_norm": 0.012502779252827168,
1198
  "learning_rate": 1.9555555555555557e-05,
1199
+ "loss": 0.0004,
1200
  "step": 4125
1201
  },
1202
  {
1203
  "epoch": 18.69,
1204
+ "grad_norm": 0.011508314870297909,
1205
  "learning_rate": 1.9e-05,
1206
  "loss": 0.0003,
1207
  "step": 4150
1208
  },
1209
  {
1210
  "epoch": 18.81,
1211
+ "grad_norm": 0.013346145860850811,
1212
  "learning_rate": 1.8444444444444445e-05,
1213
+ "loss": 0.0004,
1214
  "step": 4175
1215
  },
1216
  {
1217
  "epoch": 18.92,
1218
+ "grad_norm": 0.011840825900435448,
1219
  "learning_rate": 1.788888888888889e-05,
1220
+ "loss": 0.0004,
1221
  "step": 4200
1222
  },
1223
  {
1224
  "epoch": 19.03,
1225
+ "grad_norm": 0.009515935555100441,
1226
  "learning_rate": 1.7333333333333336e-05,
1227
  "loss": 0.0004,
1228
  "step": 4225
1229
  },
1230
  {
1231
  "epoch": 19.14,
1232
+ "grad_norm": 0.01005704328417778,
1233
  "learning_rate": 1.677777777777778e-05,
1234
  "loss": 0.0003,
1235
  "step": 4250
1236
  },
1237
  {
1238
  "epoch": 19.26,
1239
+ "grad_norm": 0.010899914428591728,
1240
  "learning_rate": 1.6222222222222223e-05,
1241
  "loss": 0.0003,
1242
  "step": 4275
1243
  },
1244
  {
1245
  "epoch": 19.37,
1246
+ "grad_norm": 0.007188358344137669,
1247
  "learning_rate": 1.5666666666666667e-05,
1248
  "loss": 0.0003,
1249
  "step": 4300
1250
  },
1251
  {
1252
  "epoch": 19.48,
1253
+ "grad_norm": 0.00955110415816307,
1254
  "learning_rate": 1.5111111111111112e-05,
1255
+ "loss": 0.0003,
1256
  "step": 4325
1257
  },
1258
  {
1259
  "epoch": 19.59,
1260
+ "grad_norm": 0.007988874800503254,
1261
  "learning_rate": 1.4555555555555556e-05,
1262
  "loss": 0.0003,
1263
  "step": 4350
1264
  },
1265
  {
1266
  "epoch": 19.71,
1267
+ "grad_norm": 0.009029334411025047,
1268
  "learning_rate": 1.4000000000000001e-05,
1269
+ "loss": 0.0005,
1270
  "step": 4375
1271
  },
1272
  {
1273
  "epoch": 19.82,
1274
+ "grad_norm": 0.01156931184232235,
1275
  "learning_rate": 1.3444444444444445e-05,
1276
  "loss": 0.0003,
1277
  "step": 4400
1278
  },
1279
  {
1280
  "epoch": 19.93,
1281
+ "grad_norm": 0.01129342895001173,
1282
  "learning_rate": 1.2888888888888889e-05,
1283
+ "loss": 0.0003,
1284
  "step": 4425
1285
  },
1286
  {
1287
  "epoch": 20.05,
1288
+ "grad_norm": 0.00786085519939661,
1289
  "learning_rate": 1.2333333333333334e-05,
1290
  "loss": 0.0004,
1291
  "step": 4450
1292
  },
1293
  {
1294
  "epoch": 20.16,
1295
+ "grad_norm": 0.009362996555864811,
1296
  "learning_rate": 1.1777777777777778e-05,
1297
  "loss": 0.0003,
1298
  "step": 4475
1299
  },
1300
  {
1301
  "epoch": 20.27,
1302
+ "grad_norm": 0.008518415503203869,
1303
  "learning_rate": 1.1222222222222224e-05,
1304
  "loss": 0.0003,
1305
  "step": 4500
1306
  },
1307
  {
1308
  "epoch": 20.38,
1309
+ "grad_norm": 0.007652095053344965,
1310
  "learning_rate": 1.0666666666666667e-05,
1311
  "loss": 0.0003,
1312
  "step": 4525
1313
  },
1314
  {
1315
  "epoch": 20.5,
1316
+ "grad_norm": 0.008384721353650093,
1317
  "learning_rate": 1.0111111111111111e-05,
1318
  "loss": 0.0003,
1319
  "step": 4550
1320
  },
1321
  {
1322
  "epoch": 20.61,
1323
+ "grad_norm": 0.010271112434566021,
1324
  "learning_rate": 9.555555555555556e-06,
1325
  "loss": 0.0003,
1326
  "step": 4575
1327
  },
1328
  {
1329
  "epoch": 20.72,
1330
+ "grad_norm": 0.0075312405824661255,
1331
  "learning_rate": 9e-06,
1332
+ "loss": 0.0004,
1333
  "step": 4600
1334
  },
1335
  {
1336
  "epoch": 20.83,
1337
+ "grad_norm": 0.009318512864410877,
1338
  "learning_rate": 8.444444444444446e-06,
1339
  "loss": 0.0003,
1340
  "step": 4625
1341
  },
1342
  {
1343
  "epoch": 20.95,
1344
+ "grad_norm": 0.0078095910139381886,
1345
  "learning_rate": 7.88888888888889e-06,
1346
+ "loss": 0.0003,
1347
  "step": 4650
1348
  },
1349
  {
1350
  "epoch": 21.06,
1351
+ "grad_norm": 0.00864331517368555,
1352
  "learning_rate": 7.333333333333334e-06,
1353
  "loss": 0.0003,
1354
  "step": 4675
1355
  },
1356
  {
1357
  "epoch": 21.17,
1358
+ "grad_norm": 0.007982113398611546,
1359
  "learning_rate": 6.777777777777779e-06,
1360
  "loss": 0.0003,
1361
  "step": 4700
1362
  },
1363
  {
1364
  "epoch": 21.28,
1365
+ "grad_norm": 0.009959988296031952,
1366
  "learning_rate": 6.222222222222222e-06,
1367
  "loss": 0.0003,
1368
  "step": 4725
1369
  },
1370
  {
1371
  "epoch": 21.4,
1372
+ "grad_norm": 0.006970000918954611,
1373
  "learning_rate": 5.666666666666667e-06,
1374
  "loss": 0.0003,
1375
  "step": 4750
1376
  },
1377
  {
1378
  "epoch": 21.51,
1379
+ "grad_norm": 0.0091372299939394,
1380
  "learning_rate": 5.1111111111111115e-06,
1381
  "loss": 0.0003,
1382
  "step": 4775
1383
  },
1384
  {
1385
  "epoch": 21.62,
1386
+ "grad_norm": 0.010007310658693314,
1387
  "learning_rate": 4.555555555555556e-06,
1388
  "loss": 0.0003,
1389
  "step": 4800
1390
  },
1391
  {
1392
  "epoch": 21.73,
1393
+ "grad_norm": 0.008494430221617222,
1394
  "learning_rate": 4.000000000000001e-06,
1395
+ "loss": 0.0003,
1396
  "step": 4825
1397
  },
1398
  {
1399
  "epoch": 21.85,
1400
+ "grad_norm": 0.0071678003296256065,
1401
  "learning_rate": 3.4444444444444444e-06,
1402
  "loss": 0.0003,
1403
  "step": 4850
1404
  },
1405
  {
1406
  "epoch": 21.96,
1407
+ "grad_norm": 0.00614485889673233,
1408
  "learning_rate": 2.888888888888889e-06,
1409
  "loss": 0.0003,
1410
  "step": 4875
1411
  },
1412
  {
1413
  "epoch": 22.07,
1414
+ "grad_norm": 0.008994187228381634,
1415
  "learning_rate": 2.3333333333333336e-06,
1416
  "loss": 0.0003,
1417
  "step": 4900
1418
  },
1419
  {
1420
  "epoch": 22.18,
1421
+ "grad_norm": 0.008750267326831818,
1422
  "learning_rate": 1.777777777777778e-06,
1423
  "loss": 0.0003,
1424
  "step": 4925
1425
  },
1426
  {
1427
  "epoch": 22.3,
1428
+ "grad_norm": 0.007052999921143055,
1429
  "learning_rate": 1.2222222222222223e-06,
1430
  "loss": 0.0002,
1431
  "step": 4950
1432
  },
1433
  {
1434
  "epoch": 22.41,
1435
+ "grad_norm": 0.007113702595233917,
1436
  "learning_rate": 6.666666666666667e-07,
1437
  "loss": 0.0002,
1438
  "step": 4975
1439
  },
1440
  {
1441
  "epoch": 22.52,
1442
+ "grad_norm": 0.008453252725303173,
1443
  "learning_rate": 1.1111111111111112e-07,
1444
  "loss": 0.0003,
1445
  "step": 5000
1446
  },
1447
  {
1448
  "epoch": 22.52,
1449
+ "eval_loss": 0.37487614154815674,
1450
+ "eval_runtime": 1348.7243,
1451
+ "eval_samples_per_second": 2.316,
1452
+ "eval_steps_per_second": 0.073,
1453
+ "eval_wer": 0.26639882562002626,
1454
  "step": 5000
1455
  },
1456
  {
1457
  "epoch": 22.52,
1458
  "step": 5000,
1459
  "total_flos": 2.532745423355904e+20,
1460
+ "train_loss": 0.1750582966186106,
1461
+ "train_runtime": 15499.9794,
1462
+ "train_samples_per_second": 10.323,
1463
+ "train_steps_per_second": 0.323
1464
  }
1465
  ],
1466
  "logging_steps": 25,
wandb/debug-internal.log CHANGED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240327_190513-7p2x8a0l/files/output.log CHANGED
@@ -4830,3 +4830,122 @@ Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens':
4830
  [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4831
  [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4832
  [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4830
  [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4831
  [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4832
  [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4833
+ [WARNING|configuration_utils.py:447] 2024-03-27 23:25:02,985 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4834
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4835
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4836
+ ***** train metrics *****
4837
+ epoch = 22.52
4838
+ train_loss = 0.1751
4839
+ train_runtime = 4:18:19.97
4840
+ train_samples = 7099
4841
+ train_samples_per_second = 10.323
4842
+ train_steps_per_second = 0.323
4843
+ 03/27/2024 23:25:17 - INFO - __main__ - *** Evaluate ***
4844
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4845
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4846
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4847
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4848
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4849
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4850
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4851
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4852
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4853
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4854
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4855
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4856
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4857
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4858
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4859
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4860
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4861
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4862
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4863
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4864
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4865
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4866
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4867
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4868
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4869
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4870
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4871
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4872
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4873
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4874
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4875
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4876
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4877
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4878
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4879
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4880
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4881
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4882
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4883
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4884
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4885
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4886
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4887
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4888
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4889
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4890
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4891
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4892
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4893
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4894
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4895
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4896
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4897
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4898
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4899
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4900
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4901
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4902
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4903
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4904
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4905
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4906
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4907
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4908
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4909
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4910
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4911
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4912
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4913
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4914
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4915
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4916
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4917
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4918
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4919
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4920
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4921
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4922
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4923
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4924
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4925
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4926
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4927
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4928
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4929
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4930
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4931
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4932
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4933
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4934
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4935
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4936
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4937
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4938
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4939
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4940
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4941
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4942
+ [INFO|trainer.py:3549] 2024-03-27 23:25:17,894 >> Batch size = 32e non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4943
+ ***** eval metrics *****
4944
+ epoch = 22.52
4945
+ eval_loss = 0.3749
4946
+ eval_runtime = 0:22:25.02
4947
+ eval_samples = 3123
4948
+ eval_samples_per_second = 2.322
4949
+ eval_steps_per_second = 0.073
4950
+ Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}arameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
4951
+ Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}arameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
wandb/run-20240327_190513-7p2x8a0l/files/wandb-summary.json CHANGED
@@ -1 +1 @@
1
- {"train/loss": 0.0003, "train/grad_norm": 0.008453252725303173, "train/learning_rate": 1.1111111111111112e-07, "train/epoch": 22.52, "train/global_step": 5000, "_timestamp": 1711578212.4327524, "_runtime": 15498.784944295883, "_step": 205, "eval/loss": 0.37487614154815674, "eval/wer": 0.26639882562002626, "eval/runtime": 1348.7243, "eval/samples_per_second": 2.316, "eval/steps_per_second": 0.073, "train_runtime": 15499.9794, "train_samples_per_second": 10.323, "train_steps_per_second": 0.323, "total_flos": 2.532745423355904e+20, "train_loss": 0.1750582966186106}
 
1
+ {"train/loss": 0.0003, "train/grad_norm": 0.008453252725303173, "train/learning_rate": 1.1111111111111112e-07, "train/epoch": 22.52, "train/global_step": 5000, "_timestamp": 1711579662.9203484, "_runtime": 16949.272540330887, "_step": 206, "eval/loss": 0.37487614154815674, "eval/wer": 0.26639882562002626, "eval/runtime": 1345.0244, "eval/samples_per_second": 2.322, "eval/steps_per_second": 0.073, "train_runtime": 15499.9794, "train_samples_per_second": 10.323, "train_steps_per_second": 0.323, "total_flos": 2.532745423355904e+20, "train_loss": 0.1750582966186106}
wandb/run-20240327_190513-7p2x8a0l/logs/debug-internal.log CHANGED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240327_190513-7p2x8a0l/run-7p2x8a0l.wandb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ed5aa6d2654f239d8aa803a04c3f3113aab39361e15504d2efba3ba12465d24
3
- size 4529113
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e57b83715ac212c8caaeb38d450489ce7eb544fbe4d6ac222d7a1a7447549dcc
3
+ size 4818911