kimbochen commited on
Commit
cfae9e4
1 Parent(s): 617d6c4

Training in progress, step 1000

Browse files
.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb CHANGED
@@ -108,7 +108,7 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": null,
112
  "id": "065a8cf7-e54f-4ac3-900e-609c80714fca",
113
  "metadata": {},
114
  "outputs": [],
@@ -142,17 +142,74 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": null,
146
  "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
147
  "metadata": {},
148
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  "source": [
150
  "from datasets import IterableDatasetDict\n",
151
  "\n",
152
  "raw_datasets = IterableDatasetDict()\n",
153
  "\n",
154
- "raw_datasets[\"train\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"es\", split=\"train\", use_auth_token=True) # set split=\"train+validation\" for low-resource\n",
155
- "raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"es\", split=\"test\", use_auth_token=True)"
156
  ]
157
  },
158
  {
@@ -185,14 +242,113 @@
185
  },
186
  {
187
  "cell_type": "code",
188
- "execution_count": null,
189
  "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
190
  "metadata": {},
191
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  "source": [
193
  "from transformers import WhisperProcessor\n",
194
  "\n",
195
- "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", language=\"Spanish\", task=\"transcribe\")"
196
  ]
197
  },
198
  {
@@ -213,10 +369,31 @@
213
  },
214
  {
215
  "cell_type": "code",
216
- "execution_count": null,
217
  "id": "ab5a13b4-9bd4-4aa0-aef2-b3de9b762988",
218
  "metadata": {},
219
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  "source": [
221
  "raw_datasets[\"train\"].features"
222
  ]
@@ -238,7 +415,7 @@
238
  },
239
  {
240
  "cell_type": "code",
241
- "execution_count": null,
242
  "id": "3ab6a724-3d1e-478b-a9e9-d2f85feb6c39",
243
  "metadata": {},
244
  "outputs": [],
@@ -258,7 +435,7 @@
258
  },
259
  {
260
  "cell_type": "code",
261
- "execution_count": null,
262
  "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
263
  "metadata": {},
264
  "outputs": [],
@@ -285,7 +462,7 @@
285
  },
286
  {
287
  "cell_type": "code",
288
- "execution_count": null,
289
  "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
290
  "metadata": {},
291
  "outputs": [],
@@ -321,7 +498,7 @@
321
  },
322
  {
323
  "cell_type": "code",
324
- "execution_count": null,
325
  "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
326
  "metadata": {},
327
  "outputs": [],
@@ -339,7 +516,7 @@
339
  },
340
  {
341
  "cell_type": "code",
342
- "execution_count": null,
343
  "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
344
  "metadata": {},
345
  "outputs": [],
@@ -360,7 +537,7 @@
360
  },
361
  {
362
  "cell_type": "code",
363
- "execution_count": null,
364
  "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
365
  "metadata": {},
366
  "outputs": [],
@@ -381,7 +558,7 @@
381
  },
382
  {
383
  "cell_type": "code",
384
- "execution_count": null,
385
  "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
386
  "metadata": {},
387
  "outputs": [],
@@ -451,7 +628,7 @@
451
  },
452
  {
453
  "cell_type": "code",
454
- "execution_count": null,
455
  "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
456
  "metadata": {},
457
  "outputs": [],
@@ -499,7 +676,7 @@
499
  },
500
  {
501
  "cell_type": "code",
502
- "execution_count": null,
503
  "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
504
  "metadata": {},
505
  "outputs": [],
@@ -526,10 +703,25 @@
526
  },
527
  {
528
  "cell_type": "code",
529
- "execution_count": null,
530
  "id": "b22b4011-f31f-4b57-b684-c52332f92890",
531
  "metadata": {},
532
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  "source": [
534
  "import evaluate\n",
535
  "\n",
@@ -555,7 +747,7 @@
555
  },
556
  {
557
  "cell_type": "code",
558
- "execution_count": null,
559
  "id": "a11d1bfc-9e28-460f-a287-72d8f7bc1acb",
560
  "metadata": {},
561
  "outputs": [],
@@ -605,10 +797,39 @@
605
  },
606
  {
607
  "cell_type": "code",
608
- "execution_count": null,
609
  "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
610
  "metadata": {},
611
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  "source": [
613
  "from transformers import WhisperForConditionalGeneration\n",
614
  "\n",
@@ -625,7 +846,7 @@
625
  },
626
  {
627
  "cell_type": "code",
628
- "execution_count": null,
629
  "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
630
  "metadata": {},
631
  "outputs": [],
@@ -653,7 +874,7 @@
653
  },
654
  {
655
  "cell_type": "code",
656
- "execution_count": null,
657
  "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a",
658
  "metadata": {},
659
  "outputs": [],
@@ -703,7 +924,7 @@
703
  },
704
  {
705
  "cell_type": "code",
706
- "execution_count": null,
707
  "id": "3ac16b62-b3c0-4c68-8f3d-9ecf471534b2",
708
  "metadata": {},
709
  "outputs": [],
@@ -732,10 +953,20 @@
732
  },
733
  {
734
  "cell_type": "code",
735
- "execution_count": null,
736
  "id": "d546d7fe-0543-479a-b708-2ebabec19493",
737
  "metadata": {},
738
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
739
  "source": [
740
  "from transformers import Seq2SeqTrainer\n",
741
  "\n",
@@ -761,10 +992,23 @@
761
  },
762
  {
763
  "cell_type": "code",
764
- "execution_count": null,
765
  "id": "a1ccb9ed-cbc8-4419-91c0-651e9424b672",
766
  "metadata": {},
767
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
768
  "source": [
769
  "model.save_pretrained(training_args.output_dir)\n",
770
  "processor.save_pretrained(training_args.output_dir)"
@@ -797,7 +1041,54 @@
797
  "execution_count": null,
798
  "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
799
  "metadata": {},
800
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  "source": [
802
  "trainer.train()"
803
  ]
@@ -824,7 +1115,7 @@
824
  },
825
  {
826
  "cell_type": "code",
827
- "execution_count": null,
828
  "id": "6dd0e310-9b07-4133-ac14-2ed2d7524e22",
829
  "metadata": {},
830
  "outputs": [],
@@ -832,8 +1123,8 @@
832
  "kwargs = {\n",
833
  " \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
834
  " \"dataset\": \"Common Voice 11.0\", # a 'pretty' name for the training dataset\n",
835
- " \"language\": \"es\",\n",
836
- " \"model_name\": \"Whisper Small Es - Sanchit Gandhi\", # a 'pretty' name for your model\n",
837
  " \"finetuned_from\": \"openai/whisper-small\",\n",
838
  " \"tasks\": \"automatic-speech-recognition\",\n",
839
  " \"tags\": \"whisper-event\",\n",
@@ -850,10 +1141,100 @@
850
  },
851
  {
852
  "cell_type": "code",
853
- "execution_count": null,
854
  "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
855
  "metadata": {},
856
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857
  "source": [
858
  "trainer.push_to_hub(**kwargs)"
859
  ]
@@ -875,7 +1256,7 @@
875
  "name": "python",
876
  "nbconvert_exporter": "python",
877
  "pygments_lexer": "ipython3",
878
- "version": "3.8.9"
879
  }
880
  },
881
  "nbformat": 4,
 
108
  },
109
  {
110
  "cell_type": "code",
111
+ "execution_count": 1,
112
  "id": "065a8cf7-e54f-4ac3-900e-609c80714fca",
113
  "metadata": {},
114
  "outputs": [],
 
142
  },
143
  {
144
  "cell_type": "code",
145
+ "execution_count": 2,
146
  "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
147
  "metadata": {},
148
+ "outputs": [
149
+ {
150
+ "data": {
151
+ "application/vnd.jupyter.widget-view+json": {
152
+ "model_id": "ecce3a630cdb4ebab217a88a0163b257",
153
+ "version_major": 2,
154
+ "version_minor": 0
155
+ },
156
+ "text/plain": [
157
+ "Downloading builder script: 0%| | 0.00/8.30k [00:00<?, ?B/s]"
158
+ ]
159
+ },
160
+ "metadata": {},
161
+ "output_type": "display_data"
162
+ },
163
+ {
164
+ "data": {
165
+ "application/vnd.jupyter.widget-view+json": {
166
+ "model_id": "b0141b068f944775867034bc494f88d7",
167
+ "version_major": 2,
168
+ "version_minor": 0
169
+ },
170
+ "text/plain": [
171
+ "Downloading readme: 0%| | 0.00/12.2k [00:00<?, ?B/s]"
172
+ ]
173
+ },
174
+ "metadata": {},
175
+ "output_type": "display_data"
176
+ },
177
+ {
178
+ "data": {
179
+ "application/vnd.jupyter.widget-view+json": {
180
+ "model_id": "9dd1f4ded47c4160b55f1bcedce2694f",
181
+ "version_major": 2,
182
+ "version_minor": 0
183
+ },
184
+ "text/plain": [
185
+ "Downloading extra modules: 0%| | 0.00/3.44k [00:00<?, ?B/s]"
186
+ ]
187
+ },
188
+ "metadata": {},
189
+ "output_type": "display_data"
190
+ },
191
+ {
192
+ "data": {
193
+ "application/vnd.jupyter.widget-view+json": {
194
+ "model_id": "a442da1e2a6b4271bae8ae0c655594b6",
195
+ "version_major": 2,
196
+ "version_minor": 0
197
+ },
198
+ "text/plain": [
199
+ "Downloading extra modules: 0%| | 0.00/60.9k [00:00<?, ?B/s]"
200
+ ]
201
+ },
202
+ "metadata": {},
203
+ "output_type": "display_data"
204
+ }
205
+ ],
206
  "source": [
207
  "from datasets import IterableDatasetDict\n",
208
  "\n",
209
  "raw_datasets = IterableDatasetDict()\n",
210
  "\n",
211
+ "raw_datasets[\"train\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-TW\", split=\"train\", use_auth_token=True) # set split=\"train+validation\" for low-resource\n",
212
+ "raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-TW\", split=\"test\", use_auth_token=True)"
213
  ]
214
  },
215
  {
 
242
  },
243
  {
244
  "cell_type": "code",
245
+ "execution_count": 3,
246
  "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
247
  "metadata": {},
248
+ "outputs": [
249
+ {
250
+ "data": {
251
+ "application/vnd.jupyter.widget-view+json": {
252
+ "model_id": "0d0c17f582474beebea009f021515946",
253
+ "version_major": 2,
254
+ "version_minor": 0
255
+ },
256
+ "text/plain": [
257
+ "Downloading: 0%| | 0.00/185k [00:00<?, ?B/s]"
258
+ ]
259
+ },
260
+ "metadata": {},
261
+ "output_type": "display_data"
262
+ },
263
+ {
264
+ "data": {
265
+ "application/vnd.jupyter.widget-view+json": {
266
+ "model_id": "9f48049fe65c4045ba74c6fac892945e",
267
+ "version_major": 2,
268
+ "version_minor": 0
269
+ },
270
+ "text/plain": [
271
+ "Downloading: 0%| | 0.00/829 [00:00<?, ?B/s]"
272
+ ]
273
+ },
274
+ "metadata": {},
275
+ "output_type": "display_data"
276
+ },
277
+ {
278
+ "data": {
279
+ "application/vnd.jupyter.widget-view+json": {
280
+ "model_id": "25615259dd364494bc5782b4e8231b05",
281
+ "version_major": 2,
282
+ "version_minor": 0
283
+ },
284
+ "text/plain": [
285
+ "Downloading: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
286
+ ]
287
+ },
288
+ "metadata": {},
289
+ "output_type": "display_data"
290
+ },
291
+ {
292
+ "data": {
293
+ "application/vnd.jupyter.widget-view+json": {
294
+ "model_id": "6867564094bf4c7d82d0046dccb173fe",
295
+ "version_major": 2,
296
+ "version_minor": 0
297
+ },
298
+ "text/plain": [
299
+ "Downloading: 0%| | 0.00/494k [00:00<?, ?B/s]"
300
+ ]
301
+ },
302
+ "metadata": {},
303
+ "output_type": "display_data"
304
+ },
305
+ {
306
+ "data": {
307
+ "application/vnd.jupyter.widget-view+json": {
308
+ "model_id": "2cb3be77451542868602317c4d7eff85",
309
+ "version_major": 2,
310
+ "version_minor": 0
311
+ },
312
+ "text/plain": [
313
+ "Downloading: 0%| | 0.00/52.7k [00:00<?, ?B/s]"
314
+ ]
315
+ },
316
+ "metadata": {},
317
+ "output_type": "display_data"
318
+ },
319
+ {
320
+ "data": {
321
+ "application/vnd.jupyter.widget-view+json": {
322
+ "model_id": "6dfc5dedce13459bbac6f2d695695ae0",
323
+ "version_major": 2,
324
+ "version_minor": 0
325
+ },
326
+ "text/plain": [
327
+ "Downloading: 0%| | 0.00/2.11k [00:00<?, ?B/s]"
328
+ ]
329
+ },
330
+ "metadata": {},
331
+ "output_type": "display_data"
332
+ },
333
+ {
334
+ "data": {
335
+ "application/vnd.jupyter.widget-view+json": {
336
+ "model_id": "944cb945f9dd47178ab22d418aa2934b",
337
+ "version_major": 2,
338
+ "version_minor": 0
339
+ },
340
+ "text/plain": [
341
+ "Downloading: 0%| | 0.00/2.06k [00:00<?, ?B/s]"
342
+ ]
343
+ },
344
+ "metadata": {},
345
+ "output_type": "display_data"
346
+ }
347
+ ],
348
  "source": [
349
  "from transformers import WhisperProcessor\n",
350
  "\n",
351
+ "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", language=\"chinese\", task=\"transcribe\")"
352
  ]
353
  },
354
  {
 
369
  },
370
  {
371
  "cell_type": "code",
372
+ "execution_count": 4,
373
  "id": "ab5a13b4-9bd4-4aa0-aef2-b3de9b762988",
374
  "metadata": {},
375
+ "outputs": [
376
+ {
377
+ "data": {
378
+ "text/plain": [
379
+ "{'client_id': Value(dtype='string', id=None),\n",
380
+ " 'path': Value(dtype='string', id=None),\n",
381
+ " 'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),\n",
382
+ " 'sentence': Value(dtype='string', id=None),\n",
383
+ " 'up_votes': Value(dtype='int64', id=None),\n",
384
+ " 'down_votes': Value(dtype='int64', id=None),\n",
385
+ " 'age': Value(dtype='string', id=None),\n",
386
+ " 'gender': Value(dtype='string', id=None),\n",
387
+ " 'accent': Value(dtype='string', id=None),\n",
388
+ " 'locale': Value(dtype='string', id=None),\n",
389
+ " 'segment': Value(dtype='string', id=None)}"
390
+ ]
391
+ },
392
+ "execution_count": 4,
393
+ "metadata": {},
394
+ "output_type": "execute_result"
395
+ }
396
+ ],
397
  "source": [
398
  "raw_datasets[\"train\"].features"
399
  ]
 
415
  },
416
  {
417
  "cell_type": "code",
418
+ "execution_count": 5,
419
  "id": "3ab6a724-3d1e-478b-a9e9-d2f85feb6c39",
420
  "metadata": {},
421
  "outputs": [],
 
435
  },
436
  {
437
  "cell_type": "code",
438
+ "execution_count": 6,
439
  "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
440
  "metadata": {},
441
  "outputs": [],
 
462
  },
463
  {
464
  "cell_type": "code",
465
+ "execution_count": 7,
466
  "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
467
  "metadata": {},
468
  "outputs": [],
 
498
  },
499
  {
500
  "cell_type": "code",
501
+ "execution_count": 8,
502
  "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
503
  "metadata": {},
504
  "outputs": [],
 
516
  },
517
  {
518
  "cell_type": "code",
519
+ "execution_count": 9,
520
  "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
521
  "metadata": {},
522
  "outputs": [],
 
537
  },
538
  {
539
  "cell_type": "code",
540
+ "execution_count": 10,
541
  "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
542
  "metadata": {},
543
  "outputs": [],
 
558
  },
559
  {
560
  "cell_type": "code",
561
+ "execution_count": 11,
562
  "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
563
  "metadata": {},
564
  "outputs": [],
 
628
  },
629
  {
630
  "cell_type": "code",
631
+ "execution_count": 12,
632
  "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
633
  "metadata": {},
634
  "outputs": [],
 
676
  },
677
  {
678
  "cell_type": "code",
679
+ "execution_count": 13,
680
  "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
681
  "metadata": {},
682
  "outputs": [],
 
703
  },
704
  {
705
  "cell_type": "code",
706
+ "execution_count": 14,
707
  "id": "b22b4011-f31f-4b57-b684-c52332f92890",
708
  "metadata": {},
709
+ "outputs": [
710
+ {
711
+ "data": {
712
+ "application/vnd.jupyter.widget-view+json": {
713
+ "model_id": "bafc0b31fe9a4d239eedc348d5521dfc",
714
+ "version_major": 2,
715
+ "version_minor": 0
716
+ },
717
+ "text/plain": [
718
+ "Downloading builder script: 0%| | 0.00/4.49k [00:00<?, ?B/s]"
719
+ ]
720
+ },
721
+ "metadata": {},
722
+ "output_type": "display_data"
723
+ }
724
+ ],
725
  "source": [
726
  "import evaluate\n",
727
  "\n",
 
747
  },
748
  {
749
  "cell_type": "code",
750
+ "execution_count": 15,
751
  "id": "a11d1bfc-9e28-460f-a287-72d8f7bc1acb",
752
  "metadata": {},
753
  "outputs": [],
 
797
  },
798
  {
799
  "cell_type": "code",
800
+ "execution_count": 16,
801
  "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
802
  "metadata": {},
803
+ "outputs": [
804
+ {
805
+ "data": {
806
+ "application/vnd.jupyter.widget-view+json": {
807
+ "model_id": "e1d5d79e596a416aa96bde21be6fb551",
808
+ "version_major": 2,
809
+ "version_minor": 0
810
+ },
811
+ "text/plain": [
812
+ "Downloading: 0%| | 0.00/1.97k [00:00<?, ?B/s]"
813
+ ]
814
+ },
815
+ "metadata": {},
816
+ "output_type": "display_data"
817
+ },
818
+ {
819
+ "data": {
820
+ "application/vnd.jupyter.widget-view+json": {
821
+ "model_id": "3d722a61d7a440479d0f5497a6200345",
822
+ "version_major": 2,
823
+ "version_minor": 0
824
+ },
825
+ "text/plain": [
826
+ "Downloading: 0%| | 0.00/967M [00:00<?, ?B/s]"
827
+ ]
828
+ },
829
+ "metadata": {},
830
+ "output_type": "display_data"
831
+ }
832
+ ],
833
  "source": [
834
  "from transformers import WhisperForConditionalGeneration\n",
835
  "\n",
 
846
  },
847
  {
848
  "cell_type": "code",
849
+ "execution_count": 17,
850
  "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
851
  "metadata": {},
852
  "outputs": [],
 
874
  },
875
  {
876
  "cell_type": "code",
877
+ "execution_count": 18,
878
  "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a",
879
  "metadata": {},
880
  "outputs": [],
 
924
  },
925
  {
926
  "cell_type": "code",
927
+ "execution_count": 19,
928
  "id": "3ac16b62-b3c0-4c68-8f3d-9ecf471534b2",
929
  "metadata": {},
930
  "outputs": [],
 
953
  },
954
  {
955
  "cell_type": "code",
956
+ "execution_count": 20,
957
  "id": "d546d7fe-0543-479a-b708-2ebabec19493",
958
  "metadata": {},
959
+ "outputs": [
960
+ {
961
+ "name": "stderr",
962
+ "output_type": "stream",
963
+ "text": [
964
+ "/home/ubuntu/whisper-small-zh-tw/./ is already a clone of https://huggingface.co/kimbochen/whisper-small-zh-tw. Make sure you pull the latest changes with `repo.git_pull()`.\n",
965
+ "max_steps is given, it will override any value given in num_train_epochs\n",
966
+ "Using cuda_amp half precision backend\n"
967
+ ]
968
+ }
969
+ ],
970
  "source": [
971
  "from transformers import Seq2SeqTrainer\n",
972
  "\n",
 
992
  },
993
  {
994
  "cell_type": "code",
995
+ "execution_count": 21,
996
  "id": "a1ccb9ed-cbc8-4419-91c0-651e9424b672",
997
  "metadata": {},
998
+ "outputs": [
999
+ {
1000
+ "name": "stderr",
1001
+ "output_type": "stream",
1002
+ "text": [
1003
+ "Configuration saved in ./config.json\n",
1004
+ "Model weights saved in ./pytorch_model.bin\n",
1005
+ "Feature extractor saved in ./preprocessor_config.json\n",
1006
+ "tokenizer config file saved in ./tokenizer_config.json\n",
1007
+ "Special tokens file saved in ./special_tokens_map.json\n",
1008
+ "added tokens file saved in ./added_tokens.json\n"
1009
+ ]
1010
+ }
1011
+ ],
1012
  "source": [
1013
  "model.save_pretrained(training_args.output_dir)\n",
1014
  "processor.save_pretrained(training_args.output_dir)"
 
1041
  "execution_count": null,
1042
  "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
1043
  "metadata": {},
1044
+ "outputs": [
1045
+ {
1046
+ "name": "stderr",
1047
+ "output_type": "stream",
1048
+ "text": [
1049
+ "/home/ubuntu/.venv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
1050
+ " warnings.warn(\n",
1051
+ "***** Running training *****\n",
1052
+ " Num examples = 320000\n",
1053
+ " Num Epochs = 9223372036854775807\n",
1054
+ " Instantaneous batch size per device = 64\n",
1055
+ " Total train batch size (w. parallel, distributed & accumulation) = 64\n",
1056
+ " Gradient Accumulation steps = 1\n",
1057
+ " Total optimization steps = 5000\n",
1058
+ " Number of trainable parameters = 241734912\n",
1059
+ "Reading metadata...: 6568it [00:00, 41540.60it/s]\n",
1060
+ "The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n"
1061
+ ]
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "text/html": [
1066
+ "\n",
1067
+ " <div>\n",
1068
+ " \n",
1069
+ " <progress value='29' max='5000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1070
+ " [ 29/5000 02:50 < 8:42:35, 0.16 it/s, Epoch 0.01/9223372036854775807]\n",
1071
+ " </div>\n",
1072
+ " <table border=\"1\" class=\"dataframe\">\n",
1073
+ " <thead>\n",
1074
+ " <tr style=\"text-align: left;\">\n",
1075
+ " <th>Step</th>\n",
1076
+ " <th>Training Loss</th>\n",
1077
+ " <th>Validation Loss</th>\n",
1078
+ " </tr>\n",
1079
+ " </thead>\n",
1080
+ " <tbody>\n",
1081
+ " </tbody>\n",
1082
+ "</table><p>"
1083
+ ],
1084
+ "text/plain": [
1085
+ "<IPython.core.display.HTML object>"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ }
1091
+ ],
1092
  "source": [
1093
  "trainer.train()"
1094
  ]
 
1115
  },
1116
  {
1117
  "cell_type": "code",
1118
+ "execution_count": 22,
1119
  "id": "6dd0e310-9b07-4133-ac14-2ed2d7524e22",
1120
  "metadata": {},
1121
  "outputs": [],
 
1123
  "kwargs = {\n",
1124
  " \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
1125
  " \"dataset\": \"Common Voice 11.0\", # a 'pretty' name for the training dataset\n",
1126
+ " \"language\": \"zh-TW\",\n",
1127
+ " \"model_name\": \"Whisper Small Chinese - Kimbo Chen\", # a 'pretty' name for your model\n",
1128
  " \"finetuned_from\": \"openai/whisper-small\",\n",
1129
  " \"tasks\": \"automatic-speech-recognition\",\n",
1130
  " \"tags\": \"whisper-event\",\n",
 
1141
  },
1142
  {
1143
  "cell_type": "code",
1144
+ "execution_count": 23,
1145
  "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
1146
  "metadata": {},
1147
+ "outputs": [
1148
+ {
1149
+ "name": "stderr",
1150
+ "output_type": "stream",
1151
+ "text": [
1152
+ "Saving model checkpoint to ./\n",
1153
+ "Configuration saved in ./config.json\n",
1154
+ "Model weights saved in ./pytorch_model.bin\n",
1155
+ "Feature extractor saved in ./preprocessor_config.json\n",
1156
+ "tokenizer config file saved in ./tokenizer_config.json\n",
1157
+ "Special tokens file saved in ./special_tokens_map.json\n",
1158
+ "added tokens file saved in ./added_tokens.json\n"
1159
+ ]
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "dc59052a3b7f45b2b896c03763c79f57",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "1c58442a44e84af9a6dff915e036de83",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "Upload file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "name": "stderr",
1191
+ "output_type": "stream",
1192
+ "text": [
1193
+ "remote: Scanning LFS files for validity, may be slow... \n",
1194
+ "remote: LFS file scan complete. \n",
1195
+ "To https://huggingface.co/kimbochen/whisper-small-zh-tw\n",
1196
+ " 2ee4cf3..214645d main -> main\n",
1197
+ "\n",
1198
+ "Dropping the following result as it does not have all the necessary fields:\n",
1199
+ "{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'dataset': {'name': 'Common Voice 11.0', 'type': 'mozilla-foundation/common_voice_11_0', 'config': 'zh-TW', 'split': 'test', 'args': 'zh-TW'}}\n",
1200
+ "remote: ----------------------------------------------------------\u001b[0;31m \n",
1201
+ "remote: Sorry, your push was rejected during YAML metadata verification: \n",
1202
+ "remote: - Error: \"language[0]\" must only contain lowercase characters \n",
1203
+ "remote: - Error: \"language[0]\" with value \"zh-TW\" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), or a special value like \"code\", \"multilingual\". If you want to use BCP-47 identifiers, you can specify them in language_bcp47.\u001b[0;32m \n",
1204
+ "remote: ---------------------------------------------------------- \n",
1205
+ "remote: Please find the documentation at: \n",
1206
+ "remote: https://huggingface.co/docs/hub/model-cards#model-card-metadata\u001b[0;0m \n",
1207
+ "remote: ---------------------------------------------------------- \n",
1208
+ "To https://huggingface.co/kimbochen/whisper-small-zh-tw\n",
1209
+ " ! [remote rejected] main -> main (pre-receive hook declined)\n",
1210
+ "error: failed to push some refs to 'https://huggingface.co/kimbochen/whisper-small-zh-tw'\n",
1211
+ "\n",
1212
+ "Error pushing update to the model card. Please read logs and retry.\n",
1213
+ "$remote: ----------------------------------------------------------\u001b[0;31m \n",
1214
+ "remote: Sorry, your push was rejected during YAML metadata verification: \n",
1215
+ "remote: - Error: \"language[0]\" must only contain lowercase characters \n",
1216
+ "remote: - Error: \"language[0]\" with value \"zh-TW\" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), or a special value like \"code\", \"multilingual\". If you want to use BCP-47 identifiers, you can specify them in language_bcp47.\u001b[0;32m \n",
1217
+ "remote: ---------------------------------------------------------- \n",
1218
+ "remote: Please find the documentation at: \n",
1219
+ "remote: https://huggingface.co/docs/hub/model-cards#model-card-metadata\u001b[0;0m \n",
1220
+ "remote: ---------------------------------------------------------- \n",
1221
+ "To https://huggingface.co/kimbochen/whisper-small-zh-tw\n",
1222
+ " ! [remote rejected] main -> main (pre-receive hook declined)\n",
1223
+ "error: failed to push some refs to 'https://huggingface.co/kimbochen/whisper-small-zh-tw'\n",
1224
+ "\n"
1225
+ ]
1226
+ },
1227
+ {
1228
+ "data": {
1229
+ "text/plain": [
1230
+ "'https://huggingface.co/kimbochen/whisper-small-zh-tw/commit/214645d6cd1f0e7ab6a65a854eec2e349529961c'"
1231
+ ]
1232
+ },
1233
+ "execution_count": 23,
1234
+ "metadata": {},
1235
+ "output_type": "execute_result"
1236
+ }
1237
+ ],
1238
  "source": [
1239
  "trainer.push_to_hub(**kwargs)"
1240
  ]
 
1256
  "name": "python",
1257
  "nbconvert_exporter": "python",
1258
  "pygments_lexer": "ipython3",
1259
+ "version": "3.8.10"
1260
  }
1261
  },
1262
  "nbformat": 4,
fine-tune-whisper-streaming.ipynb CHANGED
@@ -1041,7 +1041,54 @@
1041
  "execution_count": null,
1042
  "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
1043
  "metadata": {},
1044
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1045
  "source": [
1046
  "trainer.train()"
1047
  ]
@@ -1068,7 +1115,7 @@
1068
  },
1069
  {
1070
  "cell_type": "code",
1071
- "execution_count": null,
1072
  "id": "6dd0e310-9b07-4133-ac14-2ed2d7524e22",
1073
  "metadata": {},
1074
  "outputs": [],
@@ -1076,7 +1123,7 @@
1076
  "kwargs = {\n",
1077
  " \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
1078
  " \"dataset\": \"Common Voice 11.0\", # a 'pretty' name for the training dataset\n",
1079
- " \"language\": \"\",\n",
1080
  " \"model_name\": \"Whisper Small Chinese - Kimbo Chen\", # a 'pretty' name for your model\n",
1081
  " \"finetuned_from\": \"openai/whisper-small\",\n",
1082
  " \"tasks\": \"automatic-speech-recognition\",\n",
@@ -1094,10 +1141,100 @@
1094
  },
1095
  {
1096
  "cell_type": "code",
1097
- "execution_count": null,
1098
  "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
1099
  "metadata": {},
1100
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1101
  "source": [
1102
  "trainer.push_to_hub(**kwargs)"
1103
  ]
 
1041
  "execution_count": null,
1042
  "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
1043
  "metadata": {},
1044
+ "outputs": [
1045
+ {
1046
+ "name": "stderr",
1047
+ "output_type": "stream",
1048
+ "text": [
1049
+ "/home/ubuntu/.venv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
1050
+ " warnings.warn(\n",
1051
+ "***** Running training *****\n",
1052
+ " Num examples = 320000\n",
1053
+ " Num Epochs = 9223372036854775807\n",
1054
+ " Instantaneous batch size per device = 64\n",
1055
+ " Total train batch size (w. parallel, distributed & accumulation) = 64\n",
1056
+ " Gradient Accumulation steps = 1\n",
1057
+ " Total optimization steps = 5000\n",
1058
+ " Number of trainable parameters = 241734912\n",
1059
+ "Reading metadata...: 6568it [00:00, 41540.60it/s]\n",
1060
+ "The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n"
1061
+ ]
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "text/html": [
1066
+ "\n",
1067
+ " <div>\n",
1068
+ " \n",
1069
+ " <progress value='35' max='5000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1070
+ " [ 35/5000 03:29 < 8:46:02, 0.16 it/s, Epoch 0.01/9223372036854775807]\n",
1071
+ " </div>\n",
1072
+ " <table border=\"1\" class=\"dataframe\">\n",
1073
+ " <thead>\n",
1074
+ " <tr style=\"text-align: left;\">\n",
1075
+ " <th>Step</th>\n",
1076
+ " <th>Training Loss</th>\n",
1077
+ " <th>Validation Loss</th>\n",
1078
+ " </tr>\n",
1079
+ " </thead>\n",
1080
+ " <tbody>\n",
1081
+ " </tbody>\n",
1082
+ "</table><p>"
1083
+ ],
1084
+ "text/plain": [
1085
+ "<IPython.core.display.HTML object>"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ }
1091
+ ],
1092
  "source": [
1093
  "trainer.train()"
1094
  ]
 
1115
  },
1116
  {
1117
  "cell_type": "code",
1118
+ "execution_count": 22,
1119
  "id": "6dd0e310-9b07-4133-ac14-2ed2d7524e22",
1120
  "metadata": {},
1121
  "outputs": [],
 
1123
  "kwargs = {\n",
1124
  " \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
1125
  " \"dataset\": \"Common Voice 11.0\", # a 'pretty' name for the training dataset\n",
1126
+ " \"language\": \"zh-TW\",\n",
1127
  " \"model_name\": \"Whisper Small Chinese - Kimbo Chen\", # a 'pretty' name for your model\n",
1128
  " \"finetuned_from\": \"openai/whisper-small\",\n",
1129
  " \"tasks\": \"automatic-speech-recognition\",\n",
 
1141
  },
1142
  {
1143
  "cell_type": "code",
1144
+ "execution_count": 23,
1145
  "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
1146
  "metadata": {},
1147
+ "outputs": [
1148
+ {
1149
+ "name": "stderr",
1150
+ "output_type": "stream",
1151
+ "text": [
1152
+ "Saving model checkpoint to ./\n",
1153
+ "Configuration saved in ./config.json\n",
1154
+ "Model weights saved in ./pytorch_model.bin\n",
1155
+ "Feature extractor saved in ./preprocessor_config.json\n",
1156
+ "tokenizer config file saved in ./tokenizer_config.json\n",
1157
+ "Special tokens file saved in ./special_tokens_map.json\n",
1158
+ "added tokens file saved in ./added_tokens.json\n"
1159
+ ]
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "dc59052a3b7f45b2b896c03763c79f57",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "1c58442a44e84af9a6dff915e036de83",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "Upload file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "name": "stderr",
1191
+ "output_type": "stream",
1192
+ "text": [
1193
+ "remote: Scanning LFS files for validity, may be slow... \n",
1194
+ "remote: LFS file scan complete. \n",
1195
+ "To https://huggingface.co/kimbochen/whisper-small-zh-tw\n",
1196
+ " 2ee4cf3..214645d main -> main\n",
1197
+ "\n",
1198
+ "Dropping the following result as it does not have all the necessary fields:\n",
1199
+ "{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'dataset': {'name': 'Common Voice 11.0', 'type': 'mozilla-foundation/common_voice_11_0', 'config': 'zh-TW', 'split': 'test', 'args': 'zh-TW'}}\n",
1200
+ "remote: ----------------------------------------------------------\u001b[0;31m \n",
1201
+ "remote: Sorry, your push was rejected during YAML metadata verification: \n",
1202
+ "remote: - Error: \"language[0]\" must only contain lowercase characters \n",
1203
+ "remote: - Error: \"language[0]\" with value \"zh-TW\" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), or a special value like \"code\", \"multilingual\". If you want to use BCP-47 identifiers, you can specify them in language_bcp47.\u001b[0;32m \n",
1204
+ "remote: ---------------------------------------------------------- \n",
1205
+ "remote: Please find the documentation at: \n",
1206
+ "remote: https://huggingface.co/docs/hub/model-cards#model-card-metadata\u001b[0;0m \n",
1207
+ "remote: ---------------------------------------------------------- \n",
1208
+ "To https://huggingface.co/kimbochen/whisper-small-zh-tw\n",
1209
+ " ! [remote rejected] main -> main (pre-receive hook declined)\n",
1210
+ "error: failed to push some refs to 'https://huggingface.co/kimbochen/whisper-small-zh-tw'\n",
1211
+ "\n",
1212
+ "Error pushing update to the model card. Please read logs and retry.\n",
1213
+ "$remote: ----------------------------------------------------------\u001b[0;31m \n",
1214
+ "remote: Sorry, your push was rejected during YAML metadata verification: \n",
1215
+ "remote: - Error: \"language[0]\" must only contain lowercase characters \n",
1216
+ "remote: - Error: \"language[0]\" with value \"zh-TW\" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), or a special value like \"code\", \"multilingual\". If you want to use BCP-47 identifiers, you can specify them in language_bcp47.\u001b[0;32m \n",
1217
+ "remote: ---------------------------------------------------------- \n",
1218
+ "remote: Please find the documentation at: \n",
1219
+ "remote: https://huggingface.co/docs/hub/model-cards#model-card-metadata\u001b[0;0m \n",
1220
+ "remote: ---------------------------------------------------------- \n",
1221
+ "To https://huggingface.co/kimbochen/whisper-small-zh-tw\n",
1222
+ " ! [remote rejected] main -> main (pre-receive hook declined)\n",
1223
+ "error: failed to push some refs to 'https://huggingface.co/kimbochen/whisper-small-zh-tw'\n",
1224
+ "\n"
1225
+ ]
1226
+ },
1227
+ {
1228
+ "data": {
1229
+ "text/plain": [
1230
+ "'https://huggingface.co/kimbochen/whisper-small-zh-tw/commit/214645d6cd1f0e7ab6a65a854eec2e349529961c'"
1231
+ ]
1232
+ },
1233
+ "execution_count": 23,
1234
+ "metadata": {},
1235
+ "output_type": "execute_result"
1236
+ }
1237
+ ],
1238
  "source": [
1239
  "trainer.push_to_hub(**kwargs)"
1240
  ]
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0fc1b0188915501fc1066b2932bcedbe557ab656231371b7ea5278a28d488d6
3
  size 967102601
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68c37aa36016265b630dfcf67b6593ca65cefa6c6e939ab9dd790e2b04c9b56f
3
  size 967102601
runs/Dec10_02-58-52_129-213-89-27/1670641248.2035987/events.out.tfevents.1670641248.129-213-89-27.128858.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7d176987bf05d49e50c322906f78e49182290133c788cbef513dd25194be99
3
+ size 5863
runs/Dec10_02-58-52_129-213-89-27/events.out.tfevents.1670641248.129-213-89-27.128858.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57031bef3a05b71c381e0b0d76e9378fdb1bb7a416a15f96b5296653a4f5bb53
3
+ size 10869