Dandan0K committed on
Commit
4cf69d0
1 Parent(s): 922b1c1

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,27 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.model filter=lfs diff=lfs merge=lfs -text
12
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
13
  *.onnx filter=lfs diff=lfs merge=lfs -text
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - fr
4
+ license: apache-2.0
5
+ tags:
6
+ - automatic-speech-recognition
7
+ - fr
8
+ - hf-asr-leaderboard
9
+ - mozilla-foundation/common_voice_8_0
10
+ - robust-speech-event
11
+ datasets:
12
+ - mozilla-foundation/common_voice_8_0
13
+ model-index:
14
+ - name: XLS-R Wav2Vec2 French by Jonatas Grosman
15
+ results:
16
+ - task:
17
+ name: Automatic Speech Recognition
18
+ type: automatic-speech-recognition
19
+ dataset:
20
+ name: Common Voice 8
21
+ type: mozilla-foundation/common_voice_8_0
22
+ args: fr
23
+ metrics:
24
+ - name: Test WER
25
+ type: wer
26
+ value: 16.85
27
+ - name: Test CER
28
+ type: cer
29
+ value: 4.66
30
+ - name: Test WER (+LM)
31
+ type: wer
32
+ value: 16.32
33
+ - name: Test CER (+LM)
34
+ type: cer
35
+ value: 4.21
36
+ - task:
37
+ name: Automatic Speech Recognition
38
+ type: automatic-speech-recognition
39
+ dataset:
40
+ name: Robust Speech Event - Dev Data
41
+ type: speech-recognition-community-v2/dev_data
42
+ args: fr
43
+ metrics:
44
+ - name: Dev WER
45
+ type: wer
46
+ value: 22.34
47
+ - name: Dev CER
48
+ type: cer
49
+ value: 9.88
50
+ - name: Dev WER (+LM)
51
+ type: wer
52
+ value: 17.16
53
+ - name: Dev CER (+LM)
54
+ type: cer
55
+ value: 9.38
56
+ - task:
57
+ name: Automatic Speech Recognition
58
+ type: automatic-speech-recognition
59
+ dataset:
60
+ name: Robust Speech Event - Test Data
61
+ type: speech-recognition-community-v2/eval_data
62
+ args: fr
63
+ metrics:
64
+ - name: Test WER
65
+ type: wer
66
+ value: 19.15
67
+ ---
68
+
69
+ # Fine-tuned XLS-R 1B model for speech recognition in French
70
+
71
+ Fine-tuned [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on French using the train and validation splits of [Common Voice 8.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_8_0), [MediaSpeech](https://www.openslr.org/108/), [Multilingual TEDx](http://www.openslr.org/100), [Multilingual LibriSpeech](https://www.openslr.org/94/), and [Voxpopuli](https://github.com/facebookresearch/voxpopuli).
72
+ When using this model, make sure that your speech input is sampled at 16kHz.
73
+
74
+ This model has been fine-tuned with the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) tool, thanks to the GPU credits generously provided by [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)
75
+
76
+ ## Usage
77
+
78
+ Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:
79
+
80
+ ```python
81
+ from huggingsound import SpeechRecognitionModel
82
+
83
+ model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-xls-r-1b-french")
84
+ audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]
85
+
86
+ transcriptions = model.transcribe(audio_paths)
87
+ ```
88
+
89
+ Writing your own inference script:
90
+
91
+ ```python
92
+ import torch
93
+ import librosa
94
+ from datasets import load_dataset
95
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
96
+
97
+ LANG_ID = "fr"
98
+ MODEL_ID = "jonatasgrosman/wav2vec2-xls-r-1b-french"
99
+ SAMPLES = 10
100
+
101
+ test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
102
+
103
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
104
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
105
+
106
+ # Preprocessing the datasets.
107
+ # We need to read the audio files as arrays
108
def speech_file_to_array_fn(batch):
    # Load the clip referenced by batch["path"] (librosa is asked for a 16 kHz
    # sampling rate) and keep the waveform under the "speech" key.
    waveform, _ = librosa.load(batch["path"], sr=16_000)
    # The reference transcription is stored upper-cased.
    batch["sentence"] = batch["sentence"].upper()
    batch["speech"] = waveform
    return batch
113
+
114
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
115
+ inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
116
+
117
+ with torch.no_grad():
118
+ logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
119
+
120
+ predicted_ids = torch.argmax(logits, dim=-1)
121
+ predicted_sentences = processor.batch_decode(predicted_ids)
122
+ ```
123
+
124
+ ## Evaluation Commands
125
+
126
+ 1. To evaluate on `mozilla-foundation/common_voice_8_0` with split `test`
127
+
128
+ ```bash
129
+ python eval.py --model_id jonatasgrosman/wav2vec2-xls-r-1b-french --dataset mozilla-foundation/common_voice_8_0 --config fr --split test
130
+ ```
131
+
132
+ 2. To evaluate on `speech-recognition-community-v2/dev_data`
133
+
134
+ ```bash
135
+ python eval.py --model_id jonatasgrosman/wav2vec2-xls-r-1b-french --dataset speech-recognition-community-v2/dev_data --config fr --split validation --chunk_length_s 5.0 --stride_length_s 1.0
136
+ ```
137
+
138
+ ## Citation
139
+ If you want to cite this model you can use this:
140
+
141
+ ```bibtex
142
+ @misc{grosman2021xlsr-1b-french,
143
+ title={Fine-tuned {XLS-R} 1{B} model for speech recognition in {F}rench},
144
+ author={Grosman, Jonatas},
145
+ howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-french}},
146
+ year={2022}
147
+ }
148
+ ```
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": ["", "<s>", "</s>", "\u2047", " ", "'", "-", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e0", "\u00e2", "\u00e3", "\u00e7", "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ee", "\u00ef", "\u00f4", "\u00f9", "\u00fb", "\u0153"], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-1b",
3
+ "activation_dropout": 0.05,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.05,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.05,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.05,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.05,
57
+ "hidden_size": 1280,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 5120,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 48,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1280,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 1024,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.16.0.dev0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 47,
106
+ "xvector_output_dim": 512
107
+ }
eval.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from datasets import load_dataset, load_metric, Audio, Dataset
3
+ from transformers import pipeline, AutoFeatureExtractor, AutoTokenizer, AutoConfig, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
4
+ import re
5
+ import torch
6
+ import argparse
7
+ from typing import Dict
8
+
9
def log_results(result: Dataset, args: argparse.Namespace):
    """Compute WER and CER for `result`, print them and write them to disk.

    The original template marks this function "DO NOT CHANGE"; the metric
    computation below is left untouched. Only the output logging is fixed.

    Args:
        result: dataset exposing a "target" and a "prediction" column.
        args: parsed command-line arguments. Reads `dataset`, `config`,
            `split` (used to build the output file names) and `log_outputs`.

    Side effects:
        Writes `<dataset_id>_eval_results.txt` and, when `args.log_outputs`
        is truthy, `log_<dataset_id>_predictions.txt` and
        `log_<dataset_id>_targets.txt` in the current directory.
    """

    log_outputs = args.log_outputs
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metric
    wer = load_metric("wer")
    cer = load_metric("cer")

    # compute metrics
    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])

    # print & log results
    result_str = (
        f"WER: {wer_result}\n"
        f"CER: {cer_result}"
    )
    print(result_str)

    # Explicit UTF-8 so the files do not depend on the platform default encoding.
    with open(f"{dataset_id}_eval_results.txt", "w", encoding="utf-8") as f:
        f.write(result_str)

    # log all results in text file. Possibly interesting for analysis.
    # `--log_outputs` is a store_true flag, so it is False (never None) when
    # absent; test its truthiness instead of comparing against None.
    if log_outputs:
        pred_file = f"log_{dataset_id}_predictions.txt"
        target_file = f"log_{dataset_id}_targets.txt"

        with open(pred_file, "w", encoding="utf-8") as p, open(target_file, "w", encoding="utf-8") as t:

            # mapping function to write output
            def write_to_file(batch, i):
                p.write(f"{i}" + "\n")
                p.write(batch["prediction"] + "\n")
                t.write(f"{i}" + "\n")
                t.write(batch["target"] + "\n")

            result.map(write_to_file, with_indices=True)
48
+
49
+
50
def normalize_text(text: str, invalid_chars_regex: str, to_lower: bool) -> str:
    """Normalize a reference transcription before scoring.

    DO ADAPT FOR YOUR USE CASE.

    Args:
        text: raw target sentence.
        invalid_chars_regex: regular expression matching every character that
            must be dropped; each match is replaced by a single space.
        to_lower: lower-case the text when true, upper-case it otherwise.

    Returns:
        The case-folded text with invalid characters removed, runs of
        whitespace collapsed to one space, and no leading/trailing whitespace.
    """

    text = text.lower() if to_lower else text.upper()

    text = re.sub(invalid_chars_regex, " ", text)

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text).strip()

    return text
60
+
61
+
62
def main(args):
    """Evaluate `args.model_id` on one dataset split and log WER/CER.

    Steps: load the dataset split, load the processor (greedy decoding or
    LM-boosted decoding depending on `args.greedy`), resample the audio to the
    feature extractor's sampling rate, transcribe every example with an ASR
    pipeline, normalize the reference text, drop examples whose normalized
    target is empty, and hand the result to `log_results`.

    Args:
        args: parsed command-line arguments. Reads `dataset`, `config`,
            `split`, `model_id`, `greedy`, `device`, `chunk_length_s` and
            `stride_length_s`. `args.device` is overwritten in place when it
            is None.
    """
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first ten examples
    # dataset = dataset.select(range(10))

    # load processor; without --greedy the LM-backed processor supplies a decoder
    if args.greedy:
        processor = Wav2Vec2Processor.from_pretrained(args.model_id)
        decoder = None
    else:
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
        decoder = processor.decoder

    feature_extractor = processor.feature_extractor
    tokenizer = processor.tokenizer

    # resample audio
    dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))

    # load eval pipeline; pick GPU 0 when available unless a device was given
    if args.device is None:
        args.device = 0 if torch.cuda.is_available() else -1

    config = AutoConfig.from_pretrained(args.model_id)
    model = AutoModelForCTC.from_pretrained(args.model_id)

    # asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)
    asr = pipeline("automatic-speech-recognition", config=config, model=model, tokenizer=tokenizer,
                   feature_extractor=feature_extractor, decoder=decoder, device=args.device)

    # build normalizer config from the model vocabulary
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    tokens = list(tokenizer.convert_ids_to_tokens(range(0, tokenizer.vocab_size)))
    special_tokens = [
        tokenizer.pad_token, tokenizer.word_delimiter_token,
        tokenizer.unk_token, tokenizer.bos_token,
        tokenizer.eos_token,
    ]
    non_special_tokens = [x for x in tokens if x not in special_tokens]
    # Raw f-string: "\s" in a plain literal is an invalid escape sequence.
    # Everything that is neither whitespace nor a vocabulary character is invalid.
    invalid_chars_regex = rf"[^\s{re.escape(''.join(set(non_special_tokens)))}]"
    # Targets are lower-cased as soon as the vocabulary has a lower-case letter,
    # upper-cased otherwise.
    normalize_to_lower = any(token.isalpha() and token.islower() for token in non_special_tokens)

    # map function to decode audio
    def map_to_pred(batch, args=args, asr=asr, invalid_chars_regex=invalid_chars_regex, normalize_to_lower=normalize_to_lower):
        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"], invalid_chars_regex, normalize_to_lower)
        return batch

    # run inference on all examples
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # filtering out empty targets
    result = result.filter(lambda example: example["target"] != "")

    # compute and log_results
    # do not change function below
    log_results(result, args)
127
+
128
+
129
if __name__ == "__main__":
    # Command-line entry point: parse the evaluation options and run main().
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
    )
    parser.add_argument(
        "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
    )
    parser.add_argument(
        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
    )
    parser.add_argument(
        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
    )
    parser.add_argument(
        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
    )
    parser.add_argument(
        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
    )
    parser.add_argument(
        "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
    )
    parser.add_argument(
        "--greedy", action='store_true', help="If defined, the LM will be ignored during inference."
    )
    parser.add_argument(
        "--device",
        type=int,
        default=None,
        # The default is None, which main() resolves to GPU 0 when CUDA is
        # available and to CPU (-1) otherwise, so CPU is not the fixed default.
        help="The device to run the pipeline on. -1 for CPU, 0 for the first GPU and so on. "
             "Defaults to the first GPU when CUDA is available, otherwise CPU.",
    )
    args = parser.parse_args()

    main(args)
full_eval.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Runs eval.py twice per dataset: first with greedy decoding (--greedy), then
# with the language model. The greedy outputs are renamed with a `_greedy`
# suffix so the second run does not overwrite them. The targets log is not
# renamed, so the second run rewrites it.

# Stop at the first failing command so `mv` never runs on stale or missing files.
set -euo pipefail

MODEL_ID="jonatasgrosman/wav2vec2-xls-r-1b-french"

# CV 8 - TEST

python eval.py --model_id "$MODEL_ID" --dataset mozilla-foundation/common_voice_8_0 --config fr --split test --log_outputs --greedy
mv log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt log_mozilla-foundation_common_voice_8_0_fr_test_predictions_greedy.txt
mv mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt mozilla-foundation_common_voice_8_0_fr_test_eval_results_greedy.txt

python eval.py --model_id "$MODEL_ID" --dataset mozilla-foundation/common_voice_8_0 --config fr --split test --log_outputs

# HF EVENT - DEV

python eval.py --model_id "$MODEL_ID" --dataset speech-recognition-community-v2/dev_data --config fr --split validation --chunk_length_s 5.0 --stride_length_s 1.0 --log_outputs --greedy
mv log_speech-recognition-community-v2_dev_data_fr_validation_predictions.txt log_speech-recognition-community-v2_dev_data_fr_validation_predictions_greedy.txt
mv speech-recognition-community-v2_dev_data_fr_validation_eval_results.txt speech-recognition-community-v2_dev_data_fr_validation_eval_results_greedy.txt

python eval.py --model_id "$MODEL_ID" --dataset speech-recognition-community-v2/dev_data --config fr --split validation --chunk_length_s 5.0 --stride_length_s 1.0 --log_outputs
language_model/2gram_Fr_Hum_no_df1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f10db2ce07e1d2405f61cecb6599d8372a61893a2ed2147de525eb9ab7ce48
3
+ size 34663
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ </s>
2
+ <s>
3
+ Lagume
4
+ a
5
+ absence
6
+ abzence
7
+ agile
8
+ agrguile
9
+ al
10
+ amour
11
+ anglage
12
+ archestre
13
+ argile
14
+ attience
15
+ b
16
+ bacurelle
17
+ baie
18
+ bain
19
+ baissi
20
+ balade
21
+ balan
22
+ bambou
23
+ ban
24
+ bante
25
+ bap
26
+ bapetème
27
+ bapetême
28
+ bapt
29
+ baptem
30
+ barrage
31
+ bas
32
+ basket
33
+ bastia
34
+ bastioa
35
+ bastioi
36
+ bastioire
37
+ bastiore
38
+ basto
39
+ bastoire
40
+ baston
41
+ bastriore
42
+ bate
43
+ belai
44
+ ben
45
+ bente
46
+ berdé
47
+ beuil
48
+ beuile
49
+ bian
50
+ biberon
51
+ bibon
52
+ bien
53
+ bigacelle
54
+ bile
55
+ bille
56
+ bof
57
+ bombfage
58
+ bon
59
+ bonfage
60
+ bonfague
61
+ bonheur
62
+ bonjour
63
+ bouson
64
+ brume
65
+ brumedurence
66
+ brumidurence
67
+ brune
68
+ bucurelle
69
+ bulé
70
+ buson
71
+ busson
72
+
73
+ béret
74
+ bœuf
75
+ c'est
76
+ ca
77
+ cachar
78
+ cachoi
79
+ cachoir
80
+ cag
81
+ cage
82
+ cahdfo
83
+ calier
84
+ calin
85
+ campagne
86
+ canch
87
+ canchar
88
+ canchoir
89
+ canchoirche
90
+ cane
91
+ canechoir
92
+ canechou
93
+ canjeau
94
+ canoir
95
+ caravelle
96
+ carin
97
+ carotte
98
+ carré
99
+ cartié
100
+ carton
101
+ case
102
+ casier
103
+ casse
104
+ cassier
105
+ catastrophe
106
+ cecreau
107
+ cellier
108
+ cellière
109
+ cenlier
110
+ cerceau
111
+ cha
112
+ chabre
113
+ chado
114
+ chagr
115
+ chagrin
116
+ chaleur
117
+ cham
118
+ chambre
119
+ champagne
120
+ chaneso
121
+ chanson
122
+ chanzon
123
+ chem
124
+ chemin
125
+ chemisier
126
+ chemissier
127
+ choir
128
+ choix
129
+ cien
130
+ cieux
131
+ cio
132
+ coinecidence
133
+ commande
134
+ compagne
135
+ competeur
136
+ compteur
137
+ comtrueur
138
+ cométon
139
+ concidence
140
+ congé
141
+ conjeau
142
+ conjeu
143
+ consige
144
+ consigne
145
+ consinge
146
+ copeur
147
+ cotice
148
+ couette
149
+ couleur
150
+ coï
151
+ coïncidence
152
+ cran
153
+ cret
154
+ crâne
155
+ crêne
156
+ crêpe
157
+ crête
158
+ cuisine
159
+ cuisinier
160
+ curitece
161
+ cutice
162
+ cuvition
163
+ d'accord
164
+ da
165
+ daveau
166
+ de
167
+ deil
168
+ demain
169
+ demande
170
+ dente
171
+ devo
172
+ dieux
173
+ digacelle
174
+ dijacelé
175
+ dille
176
+ diège
177
+ duie
178
+ duit
179
+ débarquement
180
+ déjic
181
+ délice
182
+ déménagement
183
+ département
184
+ déret
185
+ désir
186
+ déss
187
+ déssir
188
+ dévonne
189
+ echau
190
+ engagé
191
+ escor
192
+ escroc
193
+ escroque
194
+ espitème
195
+ excitation
196
+ f
197
+ facade
198
+ face
199
+ facette
200
+ facteu
201
+ facteur
202
+ fai
203
+ faience
204
+ faisceau
205
+ faisque
206
+ fannence
207
+ fari
208
+ farine
209
+ fasco
210
+ façade
211
+ faïence
212
+ fendant
213
+ fer
214
+ fera
215
+ ferrelle
216
+ ferreuille
217
+ fi
218
+ fide
219
+ fidenete
220
+ fident
221
+ fidneute
222
+ fido
223
+ fidène
224
+ fidé
225
+ figié
226
+ figue
227
+ figure
228
+ fijure
229
+ file
230
+ fille
231
+ filon
232
+ filou
233
+ filé
234
+ filére
235
+ fir
236
+ firoie
237
+ fleur
238
+ foire
239
+ fon
240
+ for
241
+ fossile
242
+ fossé
243
+ frère
244
+ frére
245
+ fureu
246
+ fureur
247
+ furie
248
+ fégré
249
+ garçon
250
+ garçonnet
251
+ genevrier
252
+ genévrier
253
+ gidon
254
+ gigot
255
+ gou
256
+ gouache
257
+ grasentillon
258
+ grassention
259
+ grossan
260
+ grève
261
+ gudon
262
+ gui
263
+ guidion
264
+ guidon
265
+ guignon
266
+ guigot
267
+ guigote
268
+ guiro
269
+ guy
270
+ guydon
271
+
272
+ géant
273
+ gédo
274
+ gévrier
275
+ haie
276
+ haine
277
+ hauteur
278
+ heure
279
+ heureux
280
+ hupeur
281
+ héteur
282
+ hôtel
283
+ ielle
284
+ in
285
+ indiscrétion
286
+ indistranti
287
+ indistrétion
288
+ inetelingé
289
+ initilition
290
+ inittiation
291
+ instrument
292
+ instruments
293
+ intelligence
294
+ invitation
295
+ ive
296
+ j
297
+ jagul
298
+ jagule
299
+ jan
300
+ jar
301
+ jardin
302
+ jardinet
303
+ jauf
304
+ jeu
305
+ jeul
306
+ jiédo
307
+ joage
308
+ joie
309
+ joix
310
+ jol
311
+ joli
312
+ joue
313
+ jour
314
+ judion
315
+ judo
316
+ juidon
317
+ juin
318
+ juit
319
+ jéan
320
+ kerceau
321
+ la
322
+ lageule
323
+ laglu
324
+ lagu
325
+ lagueule
326
+ lagul
327
+ lagum
328
+ lait
329
+ lajle
330
+ laju
331
+ lajul
332
+ lajun
333
+ lamon
334
+ lessoie
335
+ leu
336
+ li
337
+ lieux
338
+ lin
339
+ liège
340
+ lof
341
+ loif
342
+ lumèce
343
+ lumé
344
+ légenbe
345
+ légende
346
+ légume
347
+ lésoie
348
+ malic
349
+ malice
350
+ man
351
+ manivelle
352
+ mat
353
+ matou
354
+ men
355
+ menuisier
356
+ meuil
357
+ mieul
358
+ milvan
359
+ milvenet
360
+ minvil
361
+ mirabelle
362
+ mise
363
+ misse
364
+ montagne
365
+ mouette
366
+ moulin
367
+ moulinet
368
+ mouzna
369
+ muette
370
+ musson
371
+ métal
372
+ métou
373
+ n
374
+ nage
375
+ nardé
376
+ natice
377
+ navet
378
+ navette
379
+ neo
380
+ ninoie
381
+ ninoir
382
+ noed
383
+ noine
384
+ noir
385
+ noix
386
+ nombre
387
+ non
388
+ notic
389
+ notice
390
+ nuit
391
+ nœuds
392
+ ocueil
393
+ opposition
394
+ oppossition
395
+ opposttion
396
+ orchaistre
397
+ orjestre
398
+ oté
399
+ ouest
400
+ our
401
+ pacoc
402
+ pain
403
+ pané
404
+ parade
405
+ parane
406
+ paraso
407
+ parné
408
+ paroce
409
+ passé
410
+ pastoire
411
+ paten
412
+ patien
413
+ patience
414
+ pattience
415
+ paus
416
+ pause
417
+ pausse
418
+ paussé
419
+ pefau
420
+ pente
421
+ perné
422
+ phè
423
+ phème
424
+ pinson
425
+ piné
426
+ pireau
427
+ piège
428
+ pléfant
429
+ pléfantion
430
+ pléfanttion
431
+ poi
432
+ poids
433
+ point
434
+ poire
435
+ posé
436
+ production
437
+ prune
438
+ précision
439
+ précission
440
+ préoduction
441
+ puli
442
+ pun
443
+ pune
444
+
445
+ pâté
446
+ quile
447
+ quille
448
+ quégé
449
+ r
450
+ rac
451
+ race
452
+ raie
453
+ rari
454
+ ras
455
+ rasse
456
+ rata
457
+ ratase
458
+ ratassa
459
+ ratasse
460
+ ratu
461
+ reconnaissance
462
+ rega
463
+ regret
464
+ rept
465
+ reptile
466
+ reugal
467
+ reux
468
+ revenu
469
+ rille
470
+ rof
471
+ ronjeau
472
+ rose
473
+ rosier
474
+ rue
475
+ ruit
476
+ rute
477
+ réc
478
+ réclamationpune
479
+ réconnaissance
480
+ régal
481
+ réjal
482
+ saison
483
+ saisson
484
+ sanchoir
485
+ sauvage
486
+ se
487
+ sec
488
+ secret
489
+ secrette
490
+ sepetième
491
+ septième
492
+ si
493
+ sico
494
+ sido
495
+ sidomelle
496
+ siège
497
+ sière
498
+ solo
499
+ sorpile
500
+ souré
501
+ stade
502
+ stand
503
+ ste
504
+ style
505
+ stylo
506
+ suf
507
+ t
508
+ ta
509
+ taille
510
+ tan
511
+ tanjeau
512
+ tarbion
513
+ tartatasse
514
+ te
515
+ ten
516
+ tenelle
517
+ tennaille
518
+ tente
519
+ ter
520
+ terfone
521
+ terfonet
522
+ terreau
523
+ test
524
+ tiefeul
525
+ timbre
526
+ tison
527
+ tiveau
528
+ tiège
529
+ tiégé
530
+ toire
531
+ toit
532
+ ton
533
+ tondé
534
+ toutefindulence
535
+ tradition
536
+ tradittion
537
+ trufondulence
538
+ trèbe
539
+ trède
540
+ trème
541
+ trébé
542
+ trédion
543
+ tufonle
544
+ tuit
545
+ tuite
546
+ tulème
547
+ tulé
548
+ tume
549
+ tumepune
550
+ turite
551
+ turlème
552
+
553
+ ténaille
554
+ valet
555
+ valette
556
+ valeur
557
+ vente
558
+ vo
559
+ voix
560
+ voleur
561
+ vous
562
+ zeux
563
+ é
564
+ écai
565
+ échau
566
+ écor
567
+ écueille
568
+ élève
569
+ équelle
570
+ œuf
log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_8_0_fr_test_predictions_greedy.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_8_0_fr_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_8_0_fr_test_targets_greedy.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_speech-recognition-community-v2_dev_data_fr_validation_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_speech-recognition-community-v2_dev_data_fr_validation_predictions_greedy.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_speech-recognition-community-v2_dev_data_fr_validation_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.16320660093488012
2
+ CER: 0.04213180710602369
mozilla-foundation_common_voice_8_0_fr_test_eval_results_greedy.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.168549674179772
2
+ CER: 0.046693488978296595
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "processor_class": "Wav2Vec2ProcessorWithLM",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c526b57c0fa8c775ae10130211e8e6ea900c5c3b89e10ad13651f2c35326a482
3
+ size 3850553521
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
speech-recognition-community-v2_dev_data_fr_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.1716189447145687
2
+ CER: 0.09386097249622839
speech-recognition-community-v2_dev_data_fr_validation_eval_results_greedy.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.22343778137943454
2
+ CER: 0.09883563498510696
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "processor_class": "Wav2Vec2ProcessorWithLM",
43
+ "replace_word_delimiter_char": " ",
44
+ "target_lang": null,
45
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
+ "unk_token": "<unk>",
47
+ "word_delimiter_token": "|"
48
+ }
vocab.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "'": 5,
3
+ "-": 6,
4
+ "</s>": 2,
5
+ "<pad>": 0,
6
+ "<s>": 1,
7
+ "<unk>": 3,
8
+ "a": 7,
9
+ "b": 8,
10
+ "c": 9,
11
+ "d": 10,
12
+ "e": 11,
13
+ "f": 12,
14
+ "g": 13,
15
+ "h": 14,
16
+ "i": 15,
17
+ "j": 16,
18
+ "k": 17,
19
+ "l": 18,
20
+ "m": 19,
21
+ "n": 20,
22
+ "o": 21,
23
+ "p": 22,
24
+ "q": 23,
25
+ "r": 24,
26
+ "s": 25,
27
+ "t": 26,
28
+ "u": 27,
29
+ "v": 28,
30
+ "w": 29,
31
+ "x": 30,
32
+ "y": 31,
33
+ "z": 32,
34
+ "|": 4,
35
+ "à": 33,
36
+ "â": 34,
37
+ "ã": 35,
38
+ "ç": 36,
39
+ "è": 37,
40
+ "é": 38,
41
+ "ê": 39,
42
+ "ë": 40,
43
+ "î": 41,
44
+ "ï": 42,
45
+ "ô": 43,
46
+ "ù": 44,
47
+ "û": 45,
48
+ "œ": 46
49
+ }