Plim
/

xls-r-300m-fr

@@ -434,14 +434,11 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
     text_column_name = data_args.text_column_name
     def remove_and_replace_special_characters(batch):
-        if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
-        else:
-            batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):

     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
+    chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'
     text_column_name = data_args.text_column_name
     def remove_and_replace_special_characters(batch):
+        batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name].lower()).replace('’', "'") + " "
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49ea08a4f479298cf334d5e185c3436c6386682d50c2b013bf79ba2880dd7fb2
 size 1263088113

 version https://git-lfs.github.com/spec/v1
+oid sha256:3ec1c7675b56877de46f623ac51149e73d4f88cac691dc72595621d35344ce9b
 size 1263088113

run_speech_recognition_ctc.py CHANGED Viewed

@@ -434,14 +434,11 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
     text_column_name = data_args.text_column_name
     def remove_and_replace_special_characters(batch):
-        if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
-        else:
-            batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):

     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
+    chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'
     text_column_name = data_args.text_column_name
     def remove_and_replace_special_characters(batch):
+        batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name].lower()).replace('’', "'") + " "
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):