lucio committed on
Commit
e42ea01
1 Parent(s): 60ad77a

update eval

Browse files
eval.py CHANGED
@@ -2,6 +2,8 @@
2
  import argparse
3
  import functools
4
  import re
 
 
5
  from typing import Dict
6
 
7
  from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
@@ -50,9 +52,17 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
- chars_to_ignore_regex = '[!"%,.:;?\\_|©«¬»،؛؟‒–—’“”„…‹›−☺♂�\\\\-]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
54
 
55
- text = re.sub(chars_to_ignore_regex, "", text.lower())
 
 
 
 
 
 
 
 
56
 
57
  # In addition, we can normalize the target text, e.g. removing new lines characters etc...
58
  # note that order is important here!
@@ -107,7 +117,7 @@ def main(args):
107
  dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
108
 
109
  # for testing: only process the first two examples as a test
110
- dataset = dataset.select(range(10))
111
 
112
  # load processor
113
  feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
 
2
  import argparse
3
  import functools
4
  import re
5
+ import string
6
+ import unidecode
7
  from typing import Dict
8
 
9
  from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
 
52
  def normalize_text(text: str) -> str:
53
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
54
 
55
+ chars_to_ignore_regex = f'[{re.escape(string.punctuation)}]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
56
 
57
+ text = re.sub(
58
+ chars_to_ignore_regex,
59
+ "",
60
+ re.sub("['`´]", "’", # elsewhere probably meant as glottal stop
61
+ re.sub("([og])['`´]", "\g<1>‘", # after o/g indicate modified char
62
+ unidecode.unidecode(text).lower()
63
+ )
64
+ )
65
+ ) + " "
66
 
67
  # In addition, we can normalize the target text, e.g. removing new lines characters etc...
68
  # note that order is important here!
 
117
  dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
118
 
119
  # for testing: only process the first two examples as a test
120
+ # dataset = dataset.select(range(10))
121
 
122
  # load processor
123
  feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
mozilla-foundation_common_voice_8_0_uz_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.4056227604601665
2
+ CER: 0.082530664990714