update eval
Browse files
eval.py
CHANGED
@@ -2,6 +2,8 @@
|
|
2 |
import argparse
|
3 |
import functools
|
4 |
import re
|
|
|
|
|
5 |
from typing import Dict
|
6 |
|
7 |
from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
|
@@ -50,9 +52,17 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
50 |
def normalize_text(text: str) -> str:
|
51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
52 |
|
53 |
-
chars_to_ignore_regex = '[
|
54 |
|
55 |
-
text = re.sub(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
# In addition, we can normalize the target text, e.g. by removing newline characters, etc.
|
58 |
# note that order is important here!
|
@@ -107,7 +117,7 @@ def main(args):
|
|
107 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
108 |
|
109 |
# for testing: only process the first ten examples as a test
|
110 |
-
dataset = dataset.select(range(10))
|
111 |
|
112 |
# load processor
|
113 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
|
|
2 |
import argparse
|
3 |
import functools
|
4 |
import re
|
5 |
+
import string
|
6 |
+
import unidecode
|
7 |
from typing import Dict
|
8 |
|
9 |
from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
|
|
|
52 |
def normalize_text(text: str) -> str:
|
53 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
54 |
|
55 |
+
chars_to_ignore_regex = f'[{re.escape(string.punctuation)}]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
56 |
|
57 |
+
text = re.sub(
|
58 |
+
chars_to_ignore_regex,
|
59 |
+
"",
|
60 |
+
re.sub("['`´]", "’", # elsewhere probably meant as glottal stop
|
61 |
+
re.sub("([og])['`´]", "\g<1>‘", # after o/g indicate modified char
|
62 |
+
unidecode.unidecode(text).lower()
|
63 |
+
)
|
64 |
+
)
|
65 |
+
) + " "
|
66 |
|
67 |
# In addition, we can normalize the target text, e.g. by removing newline characters, etc.
|
68 |
# note that order is important here!
|
|
|
117 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
118 |
|
119 |
# for testing: uncomment the next line to process only the first ten examples
|
120 |
+
# dataset = dataset.select(range(10))
|
121 |
|
122 |
# load processor
|
123 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
mozilla-foundation_common_voice_8_0_uz_test_eval_results.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
WER: 0.4056227604601665
|
2 |
+
CER: 0.082530664990714
|