Updated model

Files changed:
- .ipynb_checkpoints/README-checkpoint.md (+85 −7)
- README.md (+7 −7)
- config.json (+2 −2)
- pytorch_model.bin (+2 −2)
- vocab.json (+1 −1)
.ipynb_checkpoints/README-checkpoint.md
CHANGED
@@ -29,9 +29,12 @@ results:
   type: iiith
   args: hi
   metrics:
-  - name:
+  - name: Custom Dataset Hindi WER
   type: wer
-  value:
+  value: 17.23
+  - name: CommonVoice Hindi (Test) WER
+    type: wer
+    value: 56.46
 ---

 # Wav2Vec2-Large-XLSR-53-Hindi
@@ -41,11 +44,11 @@ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav
 - [Indic TTS- IITM](https://www.iitm.ac.in/donlab/tts/index.php) and
 - [IIITH - Indic Speech Datasets](http://speech.iiit.ac.in/index.php/research-svl/69.html)

-The
+The Indic datasets are well balanced across gender and accents. However, the CommonVoice dataset is skewed towards male voices.

-Fine-tuned on facebook/wav2vec2-large-xlsr-53 using Hindi dataset :: 30 epochs >> 19.05% WER
-Resuming from checkpoints trained for another XX epochs >> XX.XX%
+Fine-tuned on facebook/wav2vec2-large-xlsr-53 using the Hindi datasets :: 60 epochs >> 17.05% WER

 When using this model, make sure that your speech input is sampled at 16kHz.

 ## Usage
@@ -85,7 +88,81 @@ print("Reference:", test_dataset["sentence"][:2])

 ## Evaluation

-The model can be evaluated as follows on the
+The model can be evaluated on the following two datasets:
+1. Custom dataset created from 20% of Indic, IIITH and CV (test): 17.23% WER
+2. CommonVoice Hindi test dataset: 56.46% WER
+
+```python
+import torch
+import torchaudio
+import datasets
+from datasets import load_dataset, load_metric
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import re
+
+## Load the datasets
+common_voice = load_dataset("common_voice", "hi")
+
+indic = load_dataset("csv", data_files={'train': "/workspace/data/hi2/indic_train_full.csv",
+                                        'test': "/workspace/data/hi2/indic_test_full.csv"}, download_mode="force_redownload")
+iiith = load_dataset("csv", data_files={'train': "/workspace/data/hi2/iiit_hi_train.csv",
+                                        'test': "/workspace/data/hi2/iiit_hi_test.csv"}, download_mode="force_redownload")
+
+## Pre-process the datasets and concatenate them into the test dataset
+# Drop the metadata columns of common_voice
+splits = ['train', 'test', 'validation', 'other', 'invalidated']
+
+for sp in splits:
+    common_voice[sp] = common_voice[sp].remove_columns(['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])
+
+common_voice = common_voice.rename_column('path', 'audio_path')
+common_voice = common_voice.rename_column('sentence', 'target_text')
+
+train_dataset = datasets.concatenate_datasets([indic['train'], iiith['train'], common_voice['train']])
+test_dataset = datasets.concatenate_datasets([indic['test'], iiith['test'], common_voice['test'], common_voice['validation']])
+
+## Load the model from the HF hub
+wer = load_metric("wer")
+
+processor = Wav2Vec2Processor.from_pretrained("skylord/wav2vec2-large-xlsr-hindi")
+model = Wav2Vec2ForCTC.from_pretrained("skylord/wav2vec2-large-xlsr-hindi")
+model.to("cuda")
+
+chars_to_ignore_regex = '[\,\?\.\!\-\'\;\:\"\“\%\‘\”\�Utrnle\_]'
+unicode_ignore_regex = r'[dceMaWpmFui\xa0\u200d]'  # Some unwanted unicode chars
+resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+# Preprocessing the datasets.
+# We need to read the audio files as arrays and clean the transcripts.
+def speech_file_to_array_fn(batch):
+    batch["target_text"] = re.sub(chars_to_ignore_regex, '', batch["target_text"])
+    batch["target_text"] = re.sub(unicode_ignore_regex, '', batch["target_text"])
+    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
+    batch["speech"] = resampler(speech_array).squeeze().numpy()
+    return batch
+
+test_dataset = test_dataset.map(speech_file_to_array_fn)
+
+# Run batched inference and decode the predictions.
+def evaluate(batch):
+    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+    pred_ids = torch.argmax(logits, dim=-1)
+    batch["pred_strings"] = processor.batch_decode(pred_ids)
+    return batch
+
+result = test_dataset.map(evaluate, batched=True, batch_size=8)
+print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["target_text"])))
+```
+
+**Test Result on custom dataset**: 17.23 %
+

 ```python
@@ -132,7 +209,8 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```

-**Test Result**:
+**Test Result on CommonVoice**: 56.46 %
+

 ## Training
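The evaluation script in this diff hard-codes a 48 kHz to 16 kHz resampler, which suits the CommonVoice recordings but not audio at other rates, while the card stresses that input must be sampled at 16 kHz. A minimal sketch of preparing an arbitrary clip for the model; `clip.wav` is a hypothetical local file, not part of the repository:

```python
# Sketch: bring any input clip to the 16 kHz mono input the model expects.
# "clip.wav" is a placeholder path.
import torch
import torchaudio

speech_array, sampling_rate = torchaudio.load("clip.wav")  # (channels, frames)
speech_array = torch.mean(speech_array, dim=0)             # downmix to mono

if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    speech_array = resampler(speech_array)

speech = speech_array.numpy()  # ready for Wav2Vec2Processor(..., sampling_rate=16_000)
```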
README.md
CHANGED
@@ -34,7 +34,7 @@ results:
   value: 17.23
   - name: CommonVoice Hindi (Test) WER
     type: wer
-    value:
+    value: 56.46
 ---

 # Wav2Vec2-Large-XLSR-53-Hindi
@@ -44,11 +44,11 @@ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav
 - [Indic TTS- IITM](https://www.iitm.ac.in/donlab/tts/index.php) and
 - [IIITH - Indic Speech Datasets](http://speech.iiit.ac.in/index.php/research-svl/69.html)

-The
+The Indic datasets are well balanced across gender and accents. However, the CommonVoice dataset is skewed towards male voices.

-Fine-tuned on facebook/wav2vec2-large-xlsr-53 using Hindi dataset :: 30 epochs >> 19.05% WER
-Resuming from checkpoints trained for another XX epochs >> XX.XX%
+Fine-tuned on facebook/wav2vec2-large-xlsr-53 using the Hindi datasets :: 60 epochs >> 17.05% WER

 When using this model, make sure that your speech input is sampled at 16kHz.

 ## Usage
@@ -89,7 +89,7 @@ print("Reference:", test_dataset["sentence"][:2])

 ## Evaluation

 The model can be evaluated on the following two datasets:
-1. Custom dataset created from 20% of Indic, IIITH and CV (test)
+1. Custom dataset created from 20% of Indic, IIITH and CV (test): 17.23% WER
 2. CommonVoice Hindi test dataset

 ```python
@@ -160,7 +160,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["target_text"])))
 ```

-**Test Result on custom dataset**:
+**Test Result on custom dataset**: 17.23 %

@@ -209,7 +209,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```

-**Test Result on CommonVoice**:
+**Test Result on CommonVoice**: 56.46 %

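For context on the numbers filled in above: the `wer` metric used in the evaluation snippet returns a fraction, and the card reports 100 times that value as a percentage. A toy sketch with illustrative strings (not drawn from the actual test sets):

```python
# Sketch: relate the metric's output to the reported percentages.
from datasets import load_metric

wer = load_metric("wer")
# One deleted word against a three-word reference -> WER of 1/3.
score = wer.compute(predictions=["नमस्ते दुनिया"], references=["नमस्ते प्यारी दुनिया"])
print("WER: {:.2f}".format(100 * score))  # 33.33
```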
config.json
CHANGED
@@ -70,7 +70,7 @@
   "num_conv_pos_embeddings": 128,
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
-  "pad_token_id":
+  "pad_token_id": 74,
   "transformers_version": "4.5.0.dev0",
-  "vocab_size":
+  "vocab_size": 75
 }
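The two values added here are tied to `vocab.json` below: the vocabulary holds 75 tokens (ids 0 through 74), and `[PAD]`, which the CTC head uses as its blank token, sits at id 74. A quick consistency check, assuming both files are in the working directory:

```python
# Sketch: cross-check the filled-in config values against vocab.json.
import json

with open("config.json") as f:
    config = json.load(f)
with open("vocab.json") as f:
    vocab = json.load(f)

assert config["vocab_size"] == len(vocab)        # 75 tokens in total
assert config["pad_token_id"] == vocab["[PAD]"]  # [PAD] at id 74
```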
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c42f7b3bd5cb4694035320ae412c100449b231a49a05e2194f8cc844646fc697
+size 1262241303
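Since `pytorch_model.bin` is tracked with Git LFS, the commit only rewrites this pointer file; the new weights are identified by their SHA-256 digest and byte size. A sketch of verifying a downloaded copy against the pointer, assuming the checkpoint has already been pulled locally:

```python
# Sketch: verify a local checkpoint against the LFS pointer above.
import hashlib
import os

EXPECTED_OID = "c42f7b3bd5cb4694035320ae412c100449b231a49a05e2194f8cc844646fc697"
EXPECTED_SIZE = 1262241303  # bytes, ~1.26 GB

path = "pytorch_model.bin"
assert os.path.getsize(path) == EXPECTED_SIZE

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID
```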
vocab.json
CHANGED
@@ -1 +1 @@
-{"प": 0, "
+{"प": 0, "ग": 1, "ौ": 2, "घ": 3, "झ": 4, "ख़": 5, "ष": 6, "छ": 7, "च": 8, "ढ़": 9, "ॅ": 10, "ऱ": 11, "ा": 12, "ख": 13, "ँ": 14, "ड": 15, "ह": 16, "श": 17, "ि": 18, "उ": 19, "ो": 20, "ऊ": 21, "ॉ": 22, "ग़": 23, "ऋ": 24, "्": 25, "ः": 26, "ं": 27, "द": 28, "ळ": 29, "ठ": 30, "ृ": 31, "ल": 32, "ज़": 33, "क": 34, "त": 35, "ध": 36, "ई": 37, "र": 38, "म": 39, "ज": 41, "आ": 42, "भ": 43, "।": 44, "इ": 45, "फ़": 46, "ञ": 47, "य": 48, "ऐ": 49, "ऑ": 50, "ट": 51, "ङ": 52, "थ": 53, "ी": 54, "ु": 55, "े": 56, "ै": 57, "न": 58, "स": 59, "ू": 60, "ब": 61, "ओ": 62, "व": 63, "ड़": 64, "अ": 65, "औ": 66, "़": 67, "क़": 68, "फ": 69, "ढ": 70, "ए": 71, "ण": 72, "/": 40, "[UNK]": 73, "[PAD]": 74}
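The rebuilt `vocab.json` is a character-level Devanagari vocabulary plus `[UNK]` and `[PAD]`. As a rough illustration of how predicted id sequences become text, here is a simplified greedy CTC decode, a stand-in for what `processor.batch_decode` does, treating `[PAD]` as the CTC blank:

```python
# Sketch: greedy CTC-style decoding with this vocabulary.
import itertools
import json

with open("vocab.json") as f:
    vocab = json.load(f)
id_to_char = {i: c for c, i in vocab.items()}

def ctc_decode(ids, blank_id=74):
    collapsed = [k for k, _ in itertools.groupby(ids)]   # merge repeated ids
    return "".join(id_to_char[i] for i in collapsed if i != blank_id)

print(ctc_decode([16, 16, 74, 18, 74, 27, 28, 54]))  # -> "हिंदी"
```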
|