Update README.md
Browse files
README.md
CHANGED
@@ -26,16 +26,16 @@ model-index:
|
|
26 |
---
|
27 |
|
28 |
# Wav2Vec2-Large-XLSR-53-Marathi
|
29 |
-
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [
|
30 |
-
**WER (Word Error Rate) on the Test Set**: 12.70 %
|
31 |
## Usage
|
32 |
The model can be used directly without a language model as follows, given that your dataset has Marathi `actual_text` and `path_in_folder` columns:
|
33 |
```python
|
34 |
import torch, torchaudio
|
35 |
-
from datasets import load_dataset
|
36 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
37 |
|
38 |
-
|
|
|
39 |
|
40 |
processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
|
41 |
model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
|
@@ -46,22 +46,31 @@ def speech_file_to_array_fn(batch):
|
|
46 |
speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
|
47 |
batch["speech"] = resampler(speech_array).squeeze().numpy()
|
48 |
return batch
|
49 |
-
|
50 |
-
inputs = processor(
|
51 |
with torch.no_grad():
|
52 |
logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
|
53 |
predicted_ids = torch.argmax(logits, dim=-1)
|
54 |
print("Prediction:", processor.batch_decode(predicted_ids))
|
55 |
-
print("Reference:",
|
56 |
```
|
57 |
## Evaluation
|
58 |
Evaluated on 10% of the Marathi data on Open SLR-64.
|
59 |
```python
|
60 |
import re, torch, torchaudio
|
61 |
-
from datasets import
|
62 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
63 |
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
wer = load_metric("wer")
|
66 |
|
67 |
processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
|
@@ -76,7 +85,7 @@ def speech_file_to_array_fn(batch):
|
|
76 |
speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
|
77 |
batch["speech"] = resampler(speech_array).squeeze().numpy()
|
78 |
return batch
|
79 |
-
|
80 |
def evaluate(batch):
|
81 |
inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
|
82 |
with torch.no_grad():
|
@@ -84,13 +93,13 @@ def evaluate(batch):
|
|
84 |
pred_ids = torch.argmax(logits, dim=-1)
|
85 |
batch["pred_strings"] = processor.batch_decode(pred_ids)
|
86 |
return batch
|
87 |
-
result =
|
88 |
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["actual_text"])))
|
89 |
```
|
90 |
|
91 |
## Training
|
92 |
Train-Test ratio was 90:10.
|
93 |
-
|
94 |
|
95 |
## Training Config and Summary
|
96 |
weights-and-biases run summary [here](https://wandb.ai/wandb/xlsr/runs/3itdhtb8/overview?workspace=user-sumedhkhodke)
|
|
|
26 |
---
|
27 |
|
28 |
# Wav2Vec2-Large-XLSR-53-Marathi
|
29 |
+
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [Open SLR64](http://openslr.org/64/) dataset. When using this model, make sure that your speech input is sampled at 16 kHz. The training data contains only female voices, but the model works well for male voices too. The model was trained on Google Colab Pro using a Tesla P100 16 GB GPU.<br>
|
30 |
+
**WER (Word Error Rate) on the Test Set**: 12.70 %
|
31 |
## Usage
|
32 |
The model can be used directly without a language model as follows, given that your dataset has Marathi `actual_text` and `path_in_folder` columns:
|
33 |
```python
|
34 |
import torch, torchaudio
|
|
|
35 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
36 |
|
37 |
+
# Since Marathi is not available on Common Voice, use the dataset-loading code from the evaluation script below to build `all_data`.
|
38 |
+
mr_test_dataset = all_data['test']
|
39 |
|
40 |
processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
|
41 |
model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
|
|
|
46 |
speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
|
47 |
batch["speech"] = resampler(speech_array).squeeze().numpy()
|
48 |
return batch
|
49 |
+
mr_test_dataset = mr_test_dataset.map(speech_file_to_array_fn)
|
50 |
+
inputs = processor(mr_test_dataset["speech"][:5], sampling_rate=16_000, return_tensors="pt", padding=True)
|
51 |
with torch.no_grad():
|
52 |
logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
|
53 |
predicted_ids = torch.argmax(logits, dim=-1)
|
54 |
print("Prediction:", processor.batch_decode(predicted_ids))
|
55 |
+
print("Reference:", mr_test_dataset["actual_text"][:5])
|
56 |
```
|
57 |
## Evaluation
|
58 |
Evaluated on 10% of the Marathi data on Open SLR-64.
|
59 |
```python
|
60 |
import re, torch, torchaudio
|
61 |
+
from datasets import load_metric
|
62 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
63 |
|
64 |
+
# Below is a custom script for reading the Marathi dataset, since it is not available on Common Voice.
|
65 |
+
dataset_path = "./OpenSLR-64_Marathi/mr_in_female/" #TODO : include the path of the dataset extracted from http://openslr.org/64/
|
66 |
+
audio_df = pd.read_csv(os.path.join(dataset_path,'line_index.tsv'),sep='\t',header=None)
|
67 |
+
audio_df.columns = ['path_in_folder','actual_text']
|
68 |
+
audio_df['path_in_folder'] = audio_df['path_in_folder'].apply(lambda x: dataset_path + x + '.wav')
|
69 |
+
audio_df = audio_df.sample(frac=1, random_state=2020).reset_index(drop=True) #seed number is important for reproducibility of WER score
|
70 |
+
all_data = Dataset.from_pandas(audio_df)
|
71 |
+
all_data = all_data.train_test_split(test_size=0.10,seed=2020) #seed number is important for reproducibility of WER score
|
72 |
+
|
73 |
+
mr_test_dataset = all_data['test']
|
74 |
wer = load_metric("wer")
|
75 |
|
76 |
processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
|
|
|
85 |
speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
|
86 |
batch["speech"] = resampler(speech_array).squeeze().numpy()
|
87 |
return batch
|
88 |
+
mr_test_dataset = mr_test_dataset.map(speech_file_to_array_fn)
|
89 |
def evaluate(batch):
|
90 |
inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
|
91 |
with torch.no_grad():
|
|
|
93 |
pred_ids = torch.argmax(logits, dim=-1)
|
94 |
batch["pred_strings"] = processor.batch_decode(pred_ids)
|
95 |
return batch
|
96 |
+
result = mr_test_dataset.map(evaluate, batched=True, batch_size=8)
|
97 |
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["actual_text"])))
|
98 |
```
|
99 |
|
100 |
## Training
|
101 |
Train-Test ratio was 90:10.
|
102 |
+
The training notebook is available on Colab [here](https://colab.research.google.com/drive/1wX46fjExcgU5t3AsWhSPTipWg_aMDg2f?usp=sharing).
|
103 |
|
104 |
## Training Config and Summary
|
105 |
weights-and-biases run summary [here](https://wandb.ai/wandb/xlsr/runs/3itdhtb8/overview?workspace=user-sumedhkhodke)
|