gmihaila
/

wav2vec2-large-xlsr-53-romanian

@@ -1,217 +1,125 @@
 ---
-2
 language: ro
-3
 datasets:
-4
 - common_voice
-5
 tags:
-6
 - audio
-7
 - automatic-speech-recognition
-8
 - speech
-9
 - xlsr-fine-tuning-week
-10
 license: apache-2.0
-11
 model-index:
-12
 - name: XLSR Wav2Vec2 Romanian by George Mihaila
-13
   results:
-14
   - task:
-15
       name: Speech Recognition
-16
       type: automatic-speech-recognition
-17
     dataset:
-18
       name: Common Voice ro
-19
       type: common_voice
-20
       args: ro
-21
     metrics:
-22
        - name: Test WER
-23
          type: wer
-24
-         value: 37.1
-25
 ---
-26
-27
-# Wav2Vec2-Large-XLSR-53-Turkish
-28
-29
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Romanian using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
-30
 When using this model, make sure that your speech input is sampled at 16kHz.
-31
-32
 ## Usage
-33
-34
 The model can be used directly (without a language model) as follows:
-35
-36
 ```python
-37
 import torch
-38
 import torchaudio
-39
 from datasets import load_dataset
-40
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-41
-42
 test_dataset = load_dataset("common_voice", "ro", split="test[:2%]").
-43
-44
 processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
-45
 model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
-46
-47
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
-48
-49
 # Preprocessing the datasets.
-50
 # We need to read the audio files as arrays
-51
 def speech_file_to_array_fn(batch):
-52
 	speech_array, sampling_rate = torchaudio.load(batch["path"])
-53
 	batch["speech"] = resampler(speech_array).squeeze().numpy()
-54
 	return batch
-55
-56
 test_dataset = test_dataset.map(speech_file_to_array_fn)
-57
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
-58
-59
 with torch.no_grad():
-60
 	logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
-61
-62
 predicted_ids = torch.argmax(logits, dim=-1)
-63
-64
 print("Prediction:", processor.batch_decode(predicted_ids))
-65
 print("Reference:", test_dataset["sentence"][:2])
-66
 ```
-67
-68
-69
 ## Evaluation
-70
-71
 The model can be evaluated as follows on the Romanian test data of Common Voice.
-72
-73
-74
 ```python
-75
 import torch
-76
 import torchaudio
-77
 from datasets import load_dataset, load_metric
-78
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-79
 import re
-80
-81
 test_dataset = load_dataset("common_voice", "ro", split="test")
-82
 wer = load_metric("wer")
-83
-84
 processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
-85
 model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
-86
 model.to("cuda")
-87
-88
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
-89
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
-90
-91
 # Preprocessing the datasets.
-92
 # We need to read the audio files as arrays
-93
 def speech_file_to_array_fn(batch):
-94
 	batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-95
 	speech_array, sampling_rate = torchaudio.load(batch["path"])
-96
 	batch["speech"] = resampler(speech_array).squeeze().numpy()
-97
 	return batch
-98
-99
 test_dataset = test_dataset.map(speech_file_to_array_fn)
-100
-101
 # Preprocessing the datasets.
-102
 # We need to read the audio files as arrays
-103
 def evaluate(batch):
-104
 	inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
-105
-106
 	with torch.no_grad():
-107
 		logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
-108
-109
     pred_ids = torch.argmax(logits, dim=-1)
-110
 	batch["pred_strings"] = processor.batch_decode(pred_ids)
-111
 	return batch
-112
-113
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
-114
-115
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
-116
 ```
-117
-118
 **Test Result**: 37.10 %
-119
-120
-121
 ## Training
-122
-123
 The Common Voice `train`, `validation` datasets were used for training.
-124
-125
-The script used for training can be found [here]()
-126

 ---
 language: ro
 datasets:
 - common_voice
 tags:
 - audio
 - automatic-speech-recognition
 - speech
 - xlsr-fine-tuning-week
 license: apache-2.0
 model-index:
 - name: XLSR Wav2Vec2 Romanian by George Mihaila
   results:
   - task:
       name: Speech Recognition
       type: automatic-speech-recognition
     dataset:
       name: Common Voice ro
       type: common_voice
       args: ro
     metrics:
        - name: Test WER
          type: wer
+         value: 40.7
 ---
+# Wav2Vec2-Large-XLSR-53-Romanian
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Romanian using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
 When using this model, make sure that your speech input is sampled at 16kHz.
 ## Usage
 The model can be used directly (without a language model) as follows:
 ```python
 import torch
 import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 # First 2% of the Romanian Common Voice test split -- enough for a quick demo.
 test_dataset = load_dataset("common_voice", "ro", split="test[:2%]")
 # Processor and CTC model for this checkpoint, loaded by repository id.
 processor = Wav2Vec2Processor.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
 model = Wav2Vec2ForCTC.from_pretrained("gmihaila/wav2vec2-large-xlsr-53-romanian")
 # The model expects 16 kHz input; this resampler converts from 48 kHz.
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Dataset preprocessing: decode each audio file into a resampled sample array.
 def speech_file_to_array_fn(batch):
     """Load the clip at ``batch["path"]`` and store it resampled in ``batch["speech"]``."""
     # torchaudio.load also returns the file's sample rate; it is not used here
     # because the module-level resampler has a fixed input rate.
     waveform, _source_rate = torchaudio.load(batch["path"])
     resampled = resampler(waveform)
     batch["speech"] = resampled.squeeze().numpy()
     return batch
 # Attach the decoded audio to every example of the dataset.
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Batch the first two clips into padded PyTorch tensors.
 inputs = processor(
     test_dataset["speech"][:2],
     sampling_rate=16_000,
     return_tensors="pt",
     padding=True,
 )
 # Forward pass without gradient tracking (inference only).
 with torch.no_grad():
     logits = model(
         inputs.input_values,
         attention_mask=inputs.attention_mask,
     ).logits
 # Greedy decoding: take the highest-scoring token id along the last axis.
 predicted_ids = torch.argmax(logits, dim=-1)
 print("Prediction:", processor.batch_decode(predicted_ids))
 print("Reference:", test_dataset["sentence"][:2])
 ```
 ## Evaluation
 The model can be evaluated as follows on the Romanian test data of Common Voice.
 ```python
 import torch
 import torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
 # Full Romanian test split and the word-error-rate metric used for scoring.
 test_dataset = load_dataset("common_voice", "ro", split="test")
 wer = load_metric("wer")
 # Processor and model come from the same checkpoint; the model runs on the GPU.
 model_id = "gmihaila/wav2vec2-large-xlsr-53-romanian"
 processor = Wav2Vec2Processor.from_pretrained(model_id)
 model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cuda")
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\â€œ]'
 # 48 kHz -> 16 kHz converter applied to every decoded clip.
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Dataset preprocessing: normalise the transcript and decode the audio file.
 def speech_file_to_array_fn(batch):
     """Clean ``batch["sentence"]`` and add the resampled audio as ``batch["speech"]``."""
     # Drop the ignored punctuation, then lowercase the reference text.
     stripped = re.sub(chars_to_ignore_regex, '', batch["sentence"])
     batch["sentence"] = stripped.lower()
     # The sample rate reported by torchaudio.load is unused; the resampler's
     # input rate is fixed above.
     waveform, _source_rate = torchaudio.load(batch["path"])
     batch["speech"] = resampler(waveform).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Batched inference: transcribe a batch of preprocessed clips on the GPU.
 def evaluate(batch):
     """Add greedy CTC transcriptions of ``batch["speech"]`` as ``batch["pred_strings"]``.
     Args:
         batch: A batch of examples whose ``"speech"`` entry holds the audio
             arrays produced by ``speech_file_to_array_fn``.
     Returns:
         The same batch with a ``"pred_strings"`` entry added.
     """
     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
     # Inference only, so skip gradient tracking; tensors are moved to the
     # GPU because the model was placed there with model.to("cuda").
     with torch.no_grad():
         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
     # Highest-scoring token id along the last axis, decoded to text by the processor.
     pred_ids = torch.argmax(logits, dim=-1)
     batch["pred_strings"] = processor.batch_decode(pred_ids)
     return batch
 # Transcribe the whole test split in batches of 8 clips.
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
 # Report WER as a percentage with two decimal places.
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
 **Test Result**: 37.10 %
 ## Training
 The Common Voice `train`, `validation` datasets were used for training.
+The script used for training can be found [here]()