|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Pangloss datasets for Yongning Na (yong1288) and Japhug (japh1234)""" |
|
|
|
import csv |
|
import json |
|
import os |
|
import datasets |
|
|
|
_CITATION = { |
|
"yong1288": """ |
|
@misc{michaud_alexis_2021_5336698, |
|
author = {Michaud, Alexis and |
|
Galliot, Benjamin and |
|
Guillaume, Séverine}, |
|
title = {{Yongning Na for Natural Language Processing: a |
|
single-speaker audio corpus with transcriptions}}, |
|
month = aug, |
|
year = 2021, |
|
publisher = {Zenodo}, |
|
version = {1.0}, |
|
doi = {10.5281/zenodo.5336698}, |
|
url = {https://doi.org/10.5281/zenodo.5336698} |
|
} |
|
""", |
|
"japh1234": """\ |
|
@misc{jacques_guillaume_2021_5521112, |
|
author = {Jacques, Guillaume and |
|
Galliot, Benjamin and |
|
Guillaume, Séverine}, |
|
title = {{Japhug for Natural Language Processing: a single- |
|
speaker audio corpus with transcriptions}}, |
|
month = sep, |
|
year = 2021, |
|
publisher = {Zenodo}, |
|
version = {1.0}, |
|
doi = {10.5281/zenodo.5521112}, |
|
url = {https://doi.org/10.5281/zenodo.5521112} |
|
} |
|
""" |
|
} |
|
|
|
_DESCRIPTION = """\ |
|
These datasets are extracts from the Pangloss collection and have |
|
been preprocessed for ASR experiments in Na and Japhug. |
|
""" |
|
|
|
# Landing page of the Pangloss collection.
_HOMEPAGE = "https://pangloss.cnrs.fr/"

# URL of the legal code of the licence the data is distributed under.
_LICENSE = "https://creativecommons.org/licenses/by-nc-sa/4.0/fr/legalcode"

# Version attached to every builder configuration.
_VERSION = datasets.Version("1.0.0")
|
|
|
_LANGUAGES = { |
|
"yong1288": { |
|
"url": "https://mycore.core-cloud.net/index.php/s/vaGMeRf4Iij8MWR/download", |
|
"homepage": "https://zenodo.org/record/5336698", |
|
"description": "Yongning Na dataset", |
|
"translations": ["fr", "en", "zh"] |
|
}, |
|
"japh1234": { |
|
"url": "https://mycore.core-cloud.net/index.php/s/kuQCxmyVcUFWroV/download", |
|
"homepage": "https://zenodo.org/record/5521112", |
|
"description": "Japhug dataset", |
|
"translations": ["fr", "zh"] |
|
} |
|
} |
|
|
|
|
|
class PanglossDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Pangloss ASR extracts.

    One builder configuration is declared per entry of ``_LANGUAGES``. The
    configuration name selects the archive to download, the translation
    columns declared in the features, and the citation that is reported.
    """

    # Maps the column headers found in the CSV files to the feature names
    # exposed by the dataset. Columns absent from this mapping are dropped.
    field_translations = {
        "chemin_audio": "path",
        "nature": "doctype",
        "forme": "sentence",
        "traduction:fr": "translation:fr",
        "traduction:en": "translation:en",
        "traduction:zh": "translation:zh",
    }

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=language_name, version=_VERSION, description=language_data["description"])
        for language_name, language_data in _LANGUAGES.items()
    ]

    def _info(self):
        """Return the ``DatasetInfo`` for the selected configuration.

        The feature set always contains ``path``, ``audio``, ``sentence`` and
        ``doctype``, plus one ``translation:<code>`` string column for each
        code listed under ``translations`` for the selected language.
        """
        translation_features = {
            f"translation:{language_code}": datasets.Value("string")
            for language_code in _LANGUAGES[self.config.name]["translations"]
        }
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.features.Audio(sampling_rate=16_000),
                "sentence": datasets.Value("string"),
                "doctype": datasets.Value("string"),
                **translation_features,
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            # ``_CITATION`` is a dict keyed by language; the citation field
            # is a single string, so pass only this configuration's entry.
            citation=_CITATION[self.config.name],
        )

    def _split_generators(self, dl_manager):
        """Download the language archive and declare the three splits.

        Args:
            dl_manager: download manager used to fetch and extract the
                archive located at the selected language's ``url``.

        Returns:
            A list of ``SplitGenerator`` objects (train, test, validation, in
            that order), each pointing at ``<split>.csv`` inside the
            ``<config name>`` directory of the extracted archive.
        """
        urls = _LANGUAGES[self.config.name]["url"]
        data_dir = dl_manager.download_and_extract(urls)
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, self.config.name, f"{split_label}.csv"),
                    "split": split_label,
                },
            )
            for split_name, split_label in splits
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs read from one split's CSV file.

        Args:
            filepath: path of the CSV file describing the split.
            split: label of the split; unused here, accepted because it is
                part of the ``gen_kwargs`` built in ``_split_generators``.

        Yields:
            The zero-based row index and a dict holding the translated
            columns plus an ``audio`` entry with the audio file path.
        """
        base_dir = os.path.dirname(filepath)
        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields that contain line breaks.
        with open(filepath, encoding="utf-8", newline="") as file_descriptor:
            reader = csv.DictReader(file_descriptor)
            for key, row in enumerate(reader):
                # Translate each column by its own header so that a column
                # missing from ``field_translations`` is skipped without
                # shifting the remaining values onto the wrong names.
                data = {
                    self.field_translations[fieldname]: value
                    for fieldname, value in row.items()
                    if fieldname in self.field_translations
                }
                # The CSV stores audio paths relative to its own directory.
                data["audio"] = os.path.join(base_dir, data["path"])
                yield key, data
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: build the Japhug configuration from this script.
    # ``__file__`` replaces the hard-coded relative "pangloss.py" so the check
    # works from any working directory and under any script filename.
    datasets.load_dataset(__file__, "japh1234")
|
|
|
|
|
|
|
|
|
|