NbAiLab
/

wav2vec2-1b-npsc-nst-bokmaal

@@ -5,6 +5,7 @@ from typing import Dict
 import torch
 from datasets import Audio, Dataset, load_dataset, load_metric
 from transformers import AutoFeatureExtractor, AutoModelForCTC, pipeline, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2FeatureExtractor
 # from pyctcdecode import BeamSearchDecoderCTC
@@ -57,7 +58,7 @@ def normalize_text(text: str, dataset: str) -> str:
     if dataset.lower().endswith("nst"):
         text = text.lower()
-        text = text.replace("(...Vær stille under dette opptaket...)", "")
         text = re.sub('[áàâ]', 'a', text)
         text = re.sub('[ä]', 'æ', text)
         text = re.sub('[éèëê]', 'e', text)
@@ -77,7 +78,18 @@ def normalize_text(text: str, dataset: str) -> str:
         text = re.sub('[ö]', 'ø', text)
         text = re.sub('[ç]', 'c', text)
         text = re.sub('[úùüû]', 'u', text)
-        text = re.sub('\s', ' ', text)
     text = re.sub("<ee(eh)?>", "e", text)
     text = re.sub("<mmm?>", "m", text)
     text = re.sub("<qq>", "q", text)
@@ -140,8 +152,8 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )
-        batch["prediction"] = prediction["text"]
-        batch["target"] = normalize_text(batch["text"], args.dataset)
         return batch
     # run inference on all examples
@@ -168,6 +180,9 @@ if __name__ == "__main__":
         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'`  for Common Voice"
     )
     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
     parser.add_argument(
         "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
     )

 import torch
 from datasets import Audio, Dataset, load_dataset, load_metric
+from num2words import num2words as n2w
 from transformers import AutoFeatureExtractor, AutoModelForCTC, pipeline, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2FeatureExtractor
 # from pyctcdecode import BeamSearchDecoderCTC
     if dataset.lower().endswith("nst"):
         text = text.lower()
+        text = text.replace("(...vær stille under dette opptaket...)", "")
         text = re.sub('[áàâ]', 'a', text)
         text = re.sub('[ä]', 'æ', text)
         text = re.sub('[éèëê]', 'e', text)
         text = re.sub('[ö]', 'ø', text)
         text = re.sub('[ç]', 'c', text)
         text = re.sub('[úùüû]', 'u', text)
+        text = re.sub('\s+', ' ', text)
+    elif dataset.lower().endswith("fleurs"):
+        text = re.sub('[áàâ]', 'a', text)
+        text = re.sub('[ä]', 'æ', text)
+        text = re.sub('[éèëê]', 'e', text)
+        text = re.sub('[íìïî]', 'i', text)
+        text = re.sub('[óòöô]', 'o', text)
+        text = re.sub('[ö]', 'ø', text)
+        text = re.sub('[ç]', 'c', text)
+        text = re.sub('[úùüû]', 'u', text)
+        text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text)
+        text = re.sub('\s+', ' ', text)
     text = re.sub("<ee(eh)?>", "e", text)
     text = re.sub("<mmm?>", "m", text)
     text = re.sub("<qq>", "q", text)
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )
+        batch["prediction"] = prediction[args.text_column]
+        batch["target"] = normalize_text(args.text_column, args.dataset)
         return batch
     # run inference on all examples
         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'`  for Common Voice"
     )
     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
+    parser.add_argument(
+        "--text_column", type=str, default="text", help="Column name containing the transcription."
+    )
     parser.add_argument(
         "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
     )