working tokenizer
Files changed:
- .ipynb_checkpoints/added_tokens-checkpoint.json +1 -1
- .ipynb_checkpoints/eval-checkpoint.py +2 -2
- .ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_predictions-checkpoint.txt +0 -0
- .ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_targets-checkpoint.txt +0 -0
- .ipynb_checkpoints/special_tokens_map-checkpoint.json +1 -1
- .ipynb_checkpoints/tokenizer_config-checkpoint.json +1 -1
- added_tokens.json +1 -1
- eval.py +2 -2
- log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt +0 -0
- log_mozilla-foundation_common_voice_8_0_fr_test_targets.txt +0 -0
- mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt +2 -2
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
.ipynb_checkpoints/added_tokens-checkpoint.json
CHANGED
@@ -1 +1 @@
-{}
+{"<s>": 216, "</s>": 217, "<pad>": 218}
.ipynb_checkpoints/eval-checkpoint.py
CHANGED
@@ -85,7 +85,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

     # for testing: only process the first two examples as a test
-
+    dataset = dataset.select(range(2))

     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
@@ -105,7 +105,7 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )

-        batch["prediction"] = "".join(prediction["text"].split("<s>"))
+        batch["prediction"] = prediction["text"]  # "".join(prediction["text"].split("<s>"))
         batch["target"] = normalize_text(batch["sentence"])
         return batch

.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_predictions-checkpoint.txt
ADDED
The diff for this file is too large to render; see the raw diff.
.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_fr_test_targets-checkpoint.txt
ADDED
The diff for this file is too large to render; see the raw diff.
.ipynb_checkpoints/special_tokens_map-checkpoint.json
CHANGED
@@ -1 +1 @@
-{"bos_token": …
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
.ipynb_checkpoints/tokenizer_config-checkpoint.json
CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": …
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
added_tokens.json
CHANGED
@@ -1 +1 @@
-{}
+{"<s>": 216, "</s>": 217, "<pad>": 218}
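The three new entries take IDs 216-218, i.e. they are appended directly after the base CTC vocabulary, which suggests vocab.json holds 216 entries. A minimal sketch of how such a file is typically produced; the vocab path, constructor arguments, and vocab size are assumptions, not taken from this repo:

from transformers import Wav2Vec2CTCTokenizer

# Hypothetical reconstruction: build the tokenizer from a local vocab.json
# (assumed to contain 216 entries, since the new IDs start at 216).
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="<unk>", word_delimiter_token="|")

# Tokens not already in the base vocabulary are appended after it, and
# save_pretrained records them in added_tokens.json, yielding exactly
# {"<s>": 216, "</s>": 217, "<pad>": 218}.
tokenizer.add_tokens(["<s>", "</s>", "<pad>"], special_tokens=True)
tokenizer.save_pretrained("./")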
eval.py
CHANGED
@@ -85,7 +85,7 @@ def main(args):
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

     # for testing: only process the first two examples as a test
-
+    dataset = dataset.select(range(2))

     # load processor
     feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
@@ -105,7 +105,7 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )

-        batch["prediction"] = "".join(prediction["text"].split("<s>"))
+        batch["prediction"] = prediction["text"]  # "".join(prediction["text"].split("<s>"))
         batch["target"] = normalize_text(batch["sentence"])
         return batch

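Two fixes here: the first hunk enables dataset.select(range(2)) so the script scores only the first two examples, and the second hunk drops the manual <s> stripping because the repaired tokenizer now registers <s> as a special token, so it no longer leaks into decoded text. A small illustration of what the removed workaround did, with a made-up string:

# The old line spliced literal "<s>" markers out of the decoded transcript.
raw = "<s>bonjour à tous<s>"
print("".join(raw.split("<s>")))  # -> "bonjour à tous"

# Once <s>/</s>/<pad> are registered as special tokens (added_tokens.json
# above), CTC decoding skips them, so prediction["text"] is already clean
# and the split/join round-trip becomes a no-op.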
log_mozilla-foundation_common_voice_8_0_fr_test_predictions.txt
CHANGED
The diff for this file is too large to render; see the raw diff.
log_mozilla-foundation_common_voice_8_0_fr_test_targets.txt
CHANGED
The diff for this file is too large to render; see the raw diff.
mozilla-foundation_common_voice_8_0_fr_test_eval_results.txt
CHANGED
@@ -1,2 +1,2 @@
-WER: 0.…
-CER: 0.…
+WER: 0.0625
+CER: 0.06382978723404255
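On the two-example smoke test these scores reduce to simple fractions: WER 0.0625 is exactly 1/16 (consistent with one word error over 16 reference words), and the CER matches 3/47 (three character edits over 47 reference characters). An equivalent standalone computation with jiwer, using invented sentences sized to reproduce the WER; the real predictions and targets live in the log files above:

import jiwer

# Made-up pair with 16 reference words and one substitution,
# so WER = 1/16 = 0.0625.
targets = [
    "il fait beau dans le sud de la france",
    "elle a achete trois baguettes ce matin",
]
predictions = [
    "il fait beau dans le sud de la france",
    "elle a acheter trois baguettes ce matin",  # one substitution
]

print(jiwer.wer(targets, predictions))  # 0.0625
print(jiwer.cer(targets, predictions))  # nonzero but not 3/47; the data is invented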
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"bos_token": …
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
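Note that additional_special_tokens now lists each of <s>, </s> and <pad> four times. This kind of duplication typically accumulates when the same special tokens are re-added over repeated save/load cycles; it is harmless for decoding but easy to tidy. A hypothetical cleanup sketch, assuming the duplicates are unintentional and the repo is the working directory:

from transformers import Wav2Vec2CTCTokenizer

tok = Wav2Vec2CTCTokenizer.from_pretrained("./")

# Drop repeated entries while keeping the original order.
tok.additional_special_tokens = list(dict.fromkeys(tok.additional_special_tokens))
tok.save_pretrained("./")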
tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"unk_token": "<unk>", "bos_token": …
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
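With tokenizer_class set to Wav2Vec2CTCTokenizer and name_or_path pointing at the repo root, AutoTokenizer can resolve these files directly. A quick sanity check one might run against the repaired tokenizer; the expected values follow from added_tokens.json above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")  # resolves to Wav2Vec2CTCTokenizer

# The added tokens should map to the IDs recorded in added_tokens.json.
print(tok.convert_tokens_to_ids(["<s>", "</s>", "<pad>"]))  # expect [216, 217, 218]

# Decoding should drop them once they are registered as special tokens.
print(tok.decode([216, 217, 218], skip_special_tokens=True))  # expect ""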