|
""" |
|
Data preparation. |
|
Download: https://voice.mozilla.org/en/datasets |
|
Author |
|
------ |
|
Titouan Parcollet |
|
Luca Della Libera 2022 |
|
Pooneh Mousavi 2022 |
|
""" |
|
|
|
from dataclasses import dataclass |
|
import os |
|
import csv |
|
import re |
|
import logging |
|
import torchaudio |
|
from tqdm import tqdm |
|
import unicodedata |
|
import functools |
|
torchaudio.set_audio_backend("soundfile") |
|
from speechbrain.utils.parallel import parallel_map |
|
from speechbrain.dataio.dataio import read_audio_info |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
def prepare_common_voice(
    data_folder,
    save_folder,
    train_tsv_file=None,
    dev_tsv_file=None,
    test_tsv_file=None,
    accented_letters=False,
    language="en",
    skip_prep=False,
):
    """
    Prepares the csv files for the Mozilla Common Voice dataset.
    Download: https://voice.mozilla.org/en/datasets

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original Common Voice dataset is stored.
        This path should include the lang: /datasets/CommonVoice/<language>/
    save_folder : str
        The directory where to store the csv files.
    train_tsv_file : str, optional
        Path to the Train Common Voice .tsv file (cs).
        Defaults to ``<data_folder>/train.tsv``.
    dev_tsv_file : str, optional
        Path to the Dev Common Voice .tsv file (cs).
        Defaults to ``<data_folder>/dev.tsv``.
    test_tsv_file : str, optional
        Path to the Test Common Voice .tsv file (cs).
        Defaults to ``<data_folder>/test.tsv``.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.
    language : str
        Specify the language for text normalization.
    skip_prep : bool
        If True, skip data preparation.

    Returns
    -------
    None

    Example
    -------
    >>> from recipes.CommonVoice.common_voice_prepare import prepare_common_voice
    >>> data_folder = '/datasets/CommonVoice/en'
    >>> save_folder = 'exp/CommonVoice_exp'
    >>> train_tsv_file = '/datasets/CommonVoice/en/train.tsv'
    >>> dev_tsv_file = '/datasets/CommonVoice/en/dev.tsv'
    >>> test_tsv_file = '/datasets/CommonVoice/en/test.tsv'
    >>> accented_letters = False
    >>> prepare_common_voice( \
                 data_folder, \
                 save_folder, \
                 train_tsv_file, \
                 dev_tsv_file, \
                 test_tsv_file, \
                 accented_letters, \
                 language="en" \
                 )
    """
    if skip_prep:
        return

    # Default the tsv paths to the standard Common Voice layout.
    # (The original had no-op `else: x = x` branches here.)
    if train_tsv_file is None:
        train_tsv_file = data_folder + "/train.tsv"
    if dev_tsv_file is None:
        dev_tsv_file = data_folder + "/dev.tsv"
    if test_tsv_file is None:
        test_tsv_file = data_folder + "/test.tsv"

    # exist_ok avoids a race if the folder appears between check and create.
    os.makedirs(save_folder, exist_ok=True)

    save_csv_train = save_folder + "/train.csv"
    save_csv_dev = save_folder + "/dev.csv"
    save_csv_test = save_folder + "/test.csv"

    # If all three csv files already exist there is nothing to do.
    if skip(save_csv_train, save_csv_dev, save_csv_test):
        for save_csv in (save_csv_train, save_csv_dev, save_csv_test):
            logger.info(
                "%s already exists, skipping data preparation!", save_csv
            )
        return

    # Additional checks to make sure the data folder contains Common Voice
    check_commonvoice_folders(data_folder)

    # Create one csv per split.
    file_pairs = zip(
        [train_tsv_file, dev_tsv_file, test_tsv_file],
        [save_csv_train, save_csv_dev, save_csv_test],
    )
    for tsv_file, save_csv in file_pairs:
        create_csv(
            tsv_file, save_csv, data_folder, accented_letters, language,
        )
|
|
|
|
|
def skip(save_csv_train, save_csv_dev, save_csv_test):
    """
    Detects if the Common Voice data preparation has been already done.
    If the preparation has been done, we can skip it.

    Returns
    -------
    bool
        if True, the preparation phase can be skipped.
        if False, it must be done.
    """
    # Preparation is complete only when every split's csv already exists.
    return all(
        os.path.isfile(path)
        for path in (save_csv_train, save_csv_dev, save_csv_test)
    )
|
|
|
|
|
@dataclass
class CVRow:
    """A single prepared Common Voice sample, one output csv row."""

    # Utterance ID: the mp3 file name without extension.
    snt_id: str
    # Clip duration in seconds (num_frames / sample_rate).
    duration: float
    # Path to the source mp3 clip under <data_folder>/clips/.
    mp3_path: str
    # Speaker ID (first column of the tsv line).
    spk_id: str
    # Normalized transcript text.
    words: str
|
|
|
|
|
def process_line(line, data_folder, language, accented_letters):
    """Process one tsv line into a CVRow, or return None to drop it.

    Arguments
    ---------
    line : str
        One raw (tab-separated) line of the Common Voice tsv file,
        expected layout: spk_id \t relative mp3 path \t transcript ...
    data_folder : str
        Path of the CommonVoice dataset.
    language : str
        Language code used for text normalization.
    accented_letters : bool
        If False, accented letters are mapped to their ASCII equivalents.

    Returns
    -------
    CVRow or None
        None when the clip is missing or the transcript is too short.
    """
    # Split once instead of re-splitting the line for every field.
    fields = line.split("\t")
    mp3_path = data_folder + "/clips/" + fields[1]
    file_name = mp3_path.split(".")[-2].split("/")[-1]
    spk_id = fields[0]
    snt_id = file_name

    # Reading the clip duration; drop the sample if the file is missing.
    if not os.path.isfile(mp3_path):
        # Bug fix: the original logged len(file_name) instead of the path,
        # making the message useless for debugging missing clips.
        logger.info("\tError loading: %s", mp3_path)
        return None
    info = read_audio_info(mp3_path)
    duration = info.num_frames / info.sample_rate

    # Getting and normalizing the transcript.
    words = fields[2]
    words = unicode_normalisation(words)
    words = language_specific_preprocess(language, words)

    # Remove accents if specified, and drop apostrophes.
    if not accented_letters:
        words = strip_accents(words)
        words = words.replace("'", " ")
        words = words.replace("’", " ")

    # Collapse runs of spaces and trim the ends.
    words = re.sub(" +", " ", words)
    words = words.strip()

    # Discard utterances that are too short: fewer than 3 characters for
    # character-based languages, fewer than 3 words otherwise.
    if language in ["ja", "ch"]:
        chars = words.replace(" ", "_")
        chars = " ".join(chars)
        if len(chars) < 3:
            return None
    else:
        if len(words.split(" ")) < 3:
            return None

    return CVRow(snt_id, duration, mp3_path, spk_id, words)
|
|
|
|
|
def create_csv(
    orig_tsv_file, csv_file, data_folder, accented_letters=False, language="en"
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    csv_file : str
        Path of the csv file to create.
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.
    language : str, optional
        Language code used for text normalization.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``orig_tsv_file`` does not exist.
    """
    # Check if the given files exist.
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # Read the tsv inside a context manager so the handle is closed
    # (the original leaked it); skip the header row. Common Voice tsv
    # files are UTF-8, so pin the encoding instead of relying on the
    # platform default.
    with open(orig_tsv_file, "r", encoding="utf-8") as tsv_f:
        loaded_csv = tsv_f.readlines()[1:]
    nb_samples = len(loaded_csv)

    logger.info("Preparing CSV files for %s samples ...", str(nb_samples))
    logger.info("Creating csv lists in %s ...", csv_file)

    total_duration = 0.0

    # Bind the fixed arguments of the per-line processor once.
    line_processor = functools.partial(
        process_line,
        data_folder=data_folder,
        language=language,
        accented_letters=accented_letters,
    )

    # Write to a temp file and atomically move it into place below, so an
    # interrupted run never leaves a truncated csv behind.
    csv_file_tmp = csv_file + ".tmp"

    # newline="" is required by the csv module to avoid doubled line
    # endings on Windows.
    with open(
        csv_file_tmp, mode="w", encoding="utf-8", newline=""
    ) as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        csv_writer.writerow(["ID", "duration", "wav", "spk_id", "wrd"])

        for line in tqdm(loaded_csv):
            row = line_processor(line)
            if row is None:
                # Missing clip or too-short transcript: skip the sample.
                continue
            total_duration += row.duration
            csv_writer.writerow(
                [
                    row.snt_id,
                    str(row.duration),
                    row.mp3_path,
                    row.spk_id,
                    row.words,
                ]
            )

    os.replace(csv_file_tmp, csv_file)

    # Final prints.
    logger.info("%s successfully created!", csv_file)
    logger.info("Number of samples: %s ", str(nb_samples))
    logger.info(
        "Total duration: %s Hours", str(round(total_duration / 3600, 2))
    )
|
|
|
|
|
def language_specific_preprocess(language, words):
    """Apply per-language cleanup rules to one transcript.

    Arguments
    ---------
    language : str
        Language code (e.g. "en", "fr", "de", "ar", "fa", "ga-IE", "es").
    words : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned transcript.

    NOTE(review): the branch structure is deliberate but subtle — "fr" is
    handled both by the first `if` (character filtering + upper-casing) and
    by the dedicated `if language == "fr"` below (apostrophe removal), and
    the `elif` chain starting at "ar" attaches to the "fr" test, not to the
    "de" one.
    """
    # Keep Latin/Cyrillic letters, digits and apostrophes; map everything
    # else to a space, then upper-case.
    if language in ["en", "fr", "it", "rw"]:
        words = re.sub(
            "[^’'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿéæœâçèàûî]+", " ", words
        ).upper()

    if language == "de":
        # Protect "ß" with a placeholder: Python's str.upper() maps
        # "ß" -> "SS", so after upper-casing the placeholder reads
        # "0000SS0000" and is restored to "ß" below.
        words = words.replace("ß", "0000ß0000")
        words = re.sub("[^’'A-Za-z0-9öÖäÄüÜß]+", " ", words).upper()
        words = words.replace("'", " ")
        words = words.replace("’", " ")
        words = words.replace(
            "0000SS0000", "ß"
        )

    if language == "fr":
        # Replace straight and typographic apostrophes with spaces.
        words = words.replace("'", " ")
        words = words.replace("’", " ")

    elif language == "ar":
        HAMZA = "\u0621"
        ALEF_MADDA = "\u0622"
        ALEF_HAMZA_ABOVE = "\u0623"
        letters = (
            "ابتةثجحخدذرزژشسصضطظعغفقكلمنهويىءآأؤإئ"
            + HAMZA
            + ALEF_MADDA
            + ALEF_HAMZA_ABOVE
        )
        # Drop every character outside the Arabic alphabet above.
        words = re.sub("[^" + letters + " ]+", "", words).upper()
    elif language == "fa":
        HAMZA = "\u0621"
        ALEF_MADDA = "\u0622"
        ALEF_HAMZA_ABOVE = "\u0623"
        letters = (
            "ابپتةثجحخچدذرزژسشصضطظعغفقگکلمنهویىءآأؤإئ"
            + HAMZA
            + ALEF_MADDA
            + ALEF_HAMZA_ABOVE
        )
        # Drop every character outside the Persian alphabet above.
        words = re.sub("[^" + letters + " ]+", "", words).upper()
    elif language == "ga-IE":
        # Irish lower() is complicated, but upper() is nondeterministic
        # only for the prefixes below, so lower-case everything except
        # t-/n- prefixes before a capital vowel (e.g. "tAthair").

        def pfxuc(a):
            # True when the word starts with a "t"/"n" prefix followed by
            # a capital vowel.
            return len(a) >= 2 and a[0] in "tn" and a[1] in "AEIOUÁÉÍÓÚ"

        def galc(w):
            # Lower-case the word, hyphenating the prefix when present.
            return w.lower() if not pfxuc(w) else w[0] + "-" + w[1:].lower()

        words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
        words = " ".join(map(galc, words.split(" ")))
    elif language == "es":
        # Replace "$" with "s" — presumably a data-cleaning substitution
        # for mis-typed Spanish text; TODO confirm rationale.
        words = words.replace("$", "s")
    return words
|
|
|
|
|
def check_commonvoice_folders(data_folder):
    """
    Check if the data folder actually contains the Common Voice dataset.
    If not, raises an error.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If data folder doesn't contain Common Voice dataset.
    """
    files_str = "/clips"

    # Guard clause: a valid Common Voice folder must contain "clips".
    if os.path.exists(data_folder + files_str):
        return

    err_msg = (
        "the folder %s does not exist (it is expected in "
        "the Common Voice dataset)" % (data_folder + files_str)
    )
    raise FileNotFoundError(err_msg)
|
|
|
|
|
def unicode_normalisation(text):
    """Coerce ``text`` to ``str``.

    Despite the name, no Unicode normal form is applied here; accent
    handling is done separately in ``strip_accents``.
    """
    return str(text)
|
|
|
|
|
def strip_accents(text):
    """Map accented letters to their unaccented ASCII base characters.

    Decomposes the text (NFD) so accents become combining marks, then
    drops every non-ASCII code point, removing the marks.
    """
    decomposed = unicodedata.normalize("NFD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")
|
|