working tokenizer
Files changed:
- .ipynb_checkpoints/added_tokens-checkpoint.json +1 -1
- .ipynb_checkpoints/eval-checkpoint.py +2 -2
- .ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_predictions-checkpoint.txt +0 -0
- .ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_targets-checkpoint.txt +0 -0
- .ipynb_checkpoints/special_tokens_map-checkpoint.json +1 -1
- .ipynb_checkpoints/tokenizer_config-checkpoint.json +1 -1
- added_tokens.json +1 -1
- eval.py +2 -2
- log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt +0 -0
- log_mozilla-foundation_common_voice_8_0_fr_test_targets.txt +0 -0
- mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt +2 -2
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
.ipynb_checkpoints/added_tokens-checkpoint.json
CHANGED
@@ -1 +1 @@
-{}
+{"<s>": 216, "</s>": 217, "<pad>": 218}
.ipynb_checkpoints/eval-checkpoint.py
CHANGED
@@ -85,7 +85,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

     # for testing: only process the first two examples as a test
-
+    dataset = dataset.select(range(2))

     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
@@ -105,7 +105,7 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )

-        batch["prediction"] = "".join(prediction["text"].split("<s>"))
+        batch["prediction"] = prediction["text"]  # "".join(prediction["text"].split("<s>"))
         batch["target"] = normalize_text(batch["sentence"])
         return batch

.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_predictions-checkpoint.txt
ADDED
The diff for this file is too large to render; see the raw diff.
.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_targets-checkpoint.txt
ADDED
The diff for this file is too large to render; see the raw diff.
.ipynb_checkpoints/special_tokens_map-checkpoint.json
CHANGED
@@ -1 +1 @@
-{"bos_token": …
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
.ipynb_checkpoints/tokenizer_config-checkpoint.json
CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": …
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
added_tokens.json
CHANGED
@@ -1 +1 @@
-{}
+{"<s>": 216, "</s>": 217, "<pad>": 218}
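The three new entries take IDs 216-218, i.e. they are appended directly after the base CTC vocabulary, which suggests vocab.json holds 216 entries. A minimal sketch of how such a file is typically produced; the vocab path, constructor arguments, and vocab size are assumptions, not taken from this repo:

from transformers import Wav2Vec2CTCTokenizer

# Hypothetical reconstruction: build the tokenizer from a local vocab.json
# (assumed to contain 216 entries, since the new IDs start at 216).
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="<unk>", word_delimiter_token="|")

# Tokens not already in the base vocabulary are appended after it, and
# save_pretrained records them in added_tokens.json, yielding exactly
# {"<s>": 216, "</s>": 217, "<pad>": 218}.
tokenizer.add_tokens(["<s>", "</s>", "<pad>"], special_tokens=True)
tokenizer.save_pretrained("./")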
eval.py
CHANGED
@@ -85,7 +85,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

     # for testing: only process the first two examples as a test
-
+    dataset = dataset.select(range(2))

     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
@@ -105,7 +105,7 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )

-        batch["prediction"] = "".join(prediction["text"].split("<s>"))
+        batch["prediction"] = prediction["text"]  # "".join(prediction["text"].split("<s>"))
         batch["target"] = normalize_text(batch["sentence"])
         return batch

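Two fixes here: the first hunk enables dataset.select(range(2)) so the script scores only the first two examples, and the second hunk drops the manual <s> stripping because the repaired tokenizer now registers <s> as a special token, so it no longer leaks into decoded text. A small illustration of what the removed workaround did, with a made-up string:

# The old line spliced literal "<s>" markers out of the decoded transcript.
raw = "<s>bonjour à tous<s>"
print("".join(raw.split("<s>")))  # -> "bonjour à tous"

# Once <s>/</s>/<pad> are registered as special tokens (added_tokens.json
# above), CTC decoding skips them, so prediction["text"] is already clean
# and the split/join round-trip becomes a no-op.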
log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt
CHANGED
The diff for this file is too large to render; see the raw diff.
log_mozilla-foundation_common_voice_8_0_fr_test_targets.txt
CHANGED
The diff for this file is too large to render; see the raw diff.
mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt
CHANGED
@@ -1,2 +1,2 @@
-WER: 0.…
-CER: 0.…
+WER: 0.0625
+CER: 0.06382978723404255
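On the two-example smoke test these scores reduce to simple fractions: WER 0.0625 is exactly 1/16 (consistent with one word error over 16 reference words), and the CER matches 3/47 (three character edits over 47 reference characters). An equivalent standalone computation with jiwer, using invented sentences sized to reproduce the WER; the real predictions and targets live in the log files above:

import jiwer

# Made-up pair with 16 reference words and one substitution,
# so WER = 1/16 = 0.0625.
targets = [
    "il fait beau dans le sud de la france",
    "elle a achete trois baguettes ce matin",
]
predictions = [
    "il fait beau dans le sud de la france",
    "elle a acheter trois baguettes ce matin",  # one substitution
]

print(jiwer.wer(targets, predictions))  # 0.0625
print(jiwer.cer(targets, predictions))  # nonzero but not 3/47; the data is invented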
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"bos_token": …
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
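Note that additional_special_tokens now lists each of <s>, </s> and <pad> four times. This kind of duplication typically accumulates when the same special tokens are re-added over repeated save/load cycles; it is harmless for decoding but easy to tidy. A hypothetical cleanup sketch, assuming the duplicates are unintentional and the repo is the working directory:

from transformers import Wav2Vec2CTCTokenizer

tok = Wav2Vec2CTCTokenizer.from_pretrained("./")

# Drop repeated entries while keeping the original order.
tok.additional_special_tokens = list(dict.fromkeys(tok.additional_special_tokens))
tok.save_pretrained("./")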
tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": …
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
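With tokenizer_class set to Wav2Vec2CTCTokenizer and name_or_path pointing at the repo root, AutoTokenizer can resolve these files directly. A quick sanity check one might run against the repaired tokenizer; the expected values follow from added_tokens.json above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")  # resolves to Wav2Vec2CTCTokenizer

# The added tokens should map to the IDs recorded in added_tokens.json.
print(tok.convert_tokens_to_ids(["<s>", "</s>", "<pad>"]))  # expect [216, 217, 218]

# Decoding should drop them once they are registered as special tokens.
print(tok.decode([216, 217, 218], skip_special_tokens=True))  # expect ""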