gmihaila committed on
Commit
a005937
1 Parent(s): 7e31ae3

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +37 -129
README.md CHANGED
@@ -1,217 +1,125 @@
1
  ---
2
- 2
3
  language: ro
4
- 3
5
  datasets:
6
- 4
7
  - common_voice
8
- 5
9
  tags:
10
- 6
11
  - audio
12
- 7
13
  - automatic-speech-recognition
14
- 8
15
  - speech
16
- 9
17
  - xlsr-fine-tuning-week
18
- 10
19
  license: apache-2.0
20
- 11
21
  model-index:
22
- 12
23
  - name: XLSR Wav2Vec2 Romanian by George Mihaila
24
- 13
25
  results:
26
- 14
27
  - task:
28
- 15
29
  name: Speech Recognition
30
- 16
31
  type: automatic-speech-recognition
32
- 17
33
  dataset:
34
- 18
35
  name: Common Voice ro
36
- 19
37
  type: common_voice
38
- 20
39
  args: {lang_id}
40
- 21
41
  metrics:
42
- 22
43
  - name: Test WER
44
- 23
45
  type: wer
46
- 24
47
- value: 37.1
48
- 25
49
  ---
50
- 26
51
- 27
52
- # Wav2Vec2-Large-XLSR-53-Turkish
53
- 28
54
- 29
55
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Romanian using the [Common Voice](https://huggingface.co/datasets/common_voice)
56
- 30
57
  When using this model, make sure that your speech input is sampled at 16kHz.
58
- 31
59
- 32
60
  ## Usage
61
- 33
62
- 34
63
  The model can be used directly (without a language model) as follows:
64
- 35
65
- 36
66
  ```python
67
- 37
68
  import torch
69
- 38
70
  import torchaudio
71
- 39
72
  from datasets import load_dataset
73
- 40
74
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
75
- 41
76
- 42
77
  test_dataset = load_dataset("common_voice", "ro", split="test[:2%]").
78
- 43
79
- 44
80
  processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
81
- 45
82
  model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
83
- 46
84
- 47
85
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
86
- 48
87
- 49
88
  # Preprocessing the datasets.
89
- 50
90
  # We need to read the aduio files as arrays
91
- 51
92
  def speech_file_to_array_fn(batch):
93
- 52
94
  speech_array, sampling_rate = torchaudio.load(batch["path"])
95
- 53
96
  batch["speech"] = resampler(speech_array).squeeze().numpy()
97
- 54
98
  return batch
99
- 55
100
- 56
101
  test_dataset = test_dataset.map(speech_file_to_array_fn)
102
- 57
103
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
104
- 58
105
- 59
106
  with torch.no_grad():
107
- 60
108
  logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
109
- 61
110
- 62
111
  predicted_ids = torch.argmax(logits, dim=-1)
112
- 63
113
- 64
114
  print("Prediction:", processor.batch_decode(predicted_ids))
115
- 65
116
  print("Reference:", test_dataset["sentence"][:2])
117
- 66
118
  ```
119
- 67
120
- 68
121
- 69
122
  ## Evaluation
123
- 70
124
- 71
125
  The model can be evaluated as follows on the {language} test data of Common Voice.
126
- 72
127
- 73
128
- 74
129
  ```python
130
- 75
131
  import torch
132
- 76
133
  import torchaudio
134
- 77
135
  from datasets import load_dataset, load_metric
136
- 78
137
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
138
- 79
139
  import re
140
- 80
141
- 81
142
  test_dataset = load_dataset("common_voice", "ro", split="test")
143
- 82
144
  wer = load_metric("wer")
145
- 83
146
- 84
147
  processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
148
- 85
149
  model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
150
- 86
151
  model.to("cuda")
152
- 87
153
- 88
154
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
155
- 89
156
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
157
- 90
158
- 91
159
  # Preprocessing the datasets.
160
- 92
161
  # We need to read the aduio files as arrays
162
- 93
163
  def speech_file_to_array_fn(batch):
164
- 94
165
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
166
- 95
167
  speech_array, sampling_rate = torchaudio.load(batch["path"])
168
- 96
169
  batch["speech"] = resampler(speech_array).squeeze().numpy()
170
- 97
171
  return batch
172
- 98
173
- 99
174
  test_dataset = test_dataset.map(speech_file_to_array_fn)
175
- 100
176
- 101
177
  # Preprocessing the datasets.
178
- 102
179
  # We need to read the aduio files as arrays
180
- 103
181
  def evaluate(batch):
182
- 104
183
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
184
- 105
185
- 106
186
  with torch.no_grad():
187
- 107
188
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
189
- 108
190
- 109
191
  pred_ids = torch.argmax(logits, dim=-1)
192
- 110
193
  batch["pred_strings"] = processor.batch_decode(pred_ids)
194
- 111
195
  return batch
196
- 112
197
- 113
198
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
199
- 114
200
- 115
201
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
202
- 116
203
  ```
204
- 117
205
- 118
206
  **Test Result**: 37.10 %
207
- 119
208
- 120
209
- 121
210
  ## Training
211
- 122
212
- 123
213
  The Common Voice `train`, `validation` datasets were used for training.
214
- 124
215
- 125
216
- The script used for training can be found [here]()
217
- 126
 
1
  ---
 
2
  language: ro
 
3
  datasets:
 
4
  - common_voice
 
5
  tags:
 
6
  - audio
 
7
  - automatic-speech-recognition
 
8
  - speech
 
9
  - xlsr-fine-tuning-week
 
10
  license: apache-2.0
 
11
  model-index:
 
12
  - name: XLSR Wav2Vec2 Romanian by George Mihaila
 
13
  results:
 
14
  - task:
 
15
  name: Speech Recognition
 
16
  type: automatic-speech-recognition
 
17
  dataset:
 
18
  name: Common Voice ro
 
19
  type: common_voice
 
20
  args: ro
 
21
  metrics:
 
22
  - name: Test WER
 
23
  type: wer
24
+ value: 40.7
 
 
25
  ---
26
+
27
+ # Wav2Vec2-Large-XLSR-53-Romanian
28
+
 
 
29
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Romanian using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
 
30
  When using this model, make sure that your speech input is sampled at 16kHz.
31
+
 
32
  ## Usage
33
+
 
34
  The model can be used directly (without a language model) as follows:
35
+
 
36
  ```python
 
37
  import torch
 
38
  import torchaudio
 
39
  from datasets import load_dataset
 
40
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
41
+
 
42
  test_dataset = load_dataset("common_voice", "ro", split="test[:2%]")
43
+
 
44
  processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
 
45
  model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
46
+
 
47
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
48
+
 
49
  # Preprocessing the datasets.
 
50
  # We need to read the audio files as arrays
 
51
  def speech_file_to_array_fn(batch):
 
52
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
53
  batch["speech"] = resampler(speech_array).squeeze().numpy()
 
54
  return batch
55
+
 
56
  test_dataset = test_dataset.map(speech_file_to_array_fn)
 
57
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
58
+
 
59
  with torch.no_grad():
 
60
  logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
61
+
 
62
  predicted_ids = torch.argmax(logits, dim=-1)
63
+
 
64
  print("Prediction:", processor.batch_decode(predicted_ids))
 
65
  print("Reference:", test_dataset["sentence"][:2])
 
66
  ```
67
+
68
+
 
69
  ## Evaluation
70
+
 
71
  The model can be evaluated as follows on the Romanian test data of Common Voice.
72
+
73
+
 
74
  ```python
 
75
  import torch
 
76
  import torchaudio
 
77
  from datasets import load_dataset, load_metric
 
78
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
79
  import re
80
+
 
81
  test_dataset = load_dataset("common_voice", "ro", split="test")
 
82
  wer = load_metric("wer")
83
+
 
84
  processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
 
85
  model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
 
86
  model.to("cuda")
87
+
88
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
 
 
89
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
90
+
 
91
  # Preprocessing the datasets.
 
92
  # We need to read the audio files as arrays
 
93
  def speech_file_to_array_fn(batch):
 
94
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
 
95
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
96
  batch["speech"] = resampler(speech_array).squeeze().numpy()
 
97
  return batch
98
+
 
99
  test_dataset = test_dataset.map(speech_file_to_array_fn)
100
+
 
101
  # Evaluating the model.
 
102
  # We run the model on each batch and decode the predicted ids into strings
 
103
  def evaluate(batch):
 
104
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
105
+
 
106
  with torch.no_grad():
 
107
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
108
+
 
109
  pred_ids = torch.argmax(logits, dim=-1)
 
110
  batch["pred_strings"] = processor.batch_decode(pred_ids)
 
111
  return batch
112
+
 
113
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
114
+
 
115
  print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 
116
  ```
117
+
 
118
  **Test Result**: 37.10 %
119
+
120
+
 
121
  ## Training
122
+
 
123
  The Common Voice `train`, `validation` datasets were used for training.
124
+
125
+ The script used for training can be found [here]()