# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pangloss datasets for Yongning Na (yong1288) and Japhug (japh1234)"""

import csv
import json
import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition


# BibTeX entries, keyed by configuration name (the Glottolog code of the language).
_CITATION = {
    "yong1288": """
@misc{michaud_alexis_2021_5336698,
  author       = {Michaud, Alexis and
                  Galliot, Benjamin and
                  Guillaume, Séverine},
  title        = {{Yongning Na for Natural Language Processing: a
                   single-speaker audio corpus with transcriptions}},
  month        = aug,
  year         = 2021,
  publisher    = {Zenodo},
  version      = {1.0},
  doi          = {10.5281/zenodo.5336698},
  url          = {https://doi.org/10.5281/zenodo.5336698}
}
""",
    "japh1234": """\
@misc{jacques_guillaume_2021_5521112,
  author       = {Jacques, Guillaume and
                  Galliot, Benjamin and
                  Guillaume, Séverine},
  title        = {{Japhug for Natural Language Processing: a single-
                   speaker audio corpus with transcriptions}},
  month        = sep,
  year         = 2021,
  publisher    = {Zenodo},
  version      = {1.0},
  doi          = {10.5281/zenodo.5521112},
  url          = {https://doi.org/10.5281/zenodo.5521112}
}
""",
}

_DESCRIPTION = """\
These datasets are extracts from the Pangloss collection and have
been preprocessed for ASR experiments in Na and Japhug.
"""

_HOMEPAGE = "https://pangloss.cnrs.fr/"

_LICENSE = "https://creativecommons.org/licenses/by-nc-sa/4.0/fr/legalcode"

_VERSION = datasets.Version("1.0.0")

# The HuggingFace Datasets library doesn't host the datasets but only points
# to the original files. Each configuration (one per language) records the
# archive URL that `_split_generators` downloads, plus the translation
# languages that become `translation:<code>` feature columns in `_info`.
_LANGUAGES = {
    "yong1288": {
        "url": "https://mycore.core-cloud.net/index.php/s/vaGMeRf4Iij8MWR/download",
        "homepage": "https://zenodo.org/record/5336698",
        "description": "Yongning Na dataset",
        "translations": ["fr", "en", "zh"],
    },
    "japh1234": {
        "url": "https://mycore.core-cloud.net/index.php/s/kuQCxmyVcUFWroV/download",
        "homepage": "https://zenodo.org/record/5521112",
        "description": "Japhug dataset",
        "translations": ["fr", "zh"],
    },
}


class PanglossDataset(datasets.GeneratorBasedBuilder):
    """The Pangloss datasets are extracts from Pangloss Collections that can be used for ASR experiments in these languages."""

    # Maps the (French) column names of the CSV files to the feature names
    # exposed by this dataset. CSV columns absent from this mapping are
    # dropped by `_generate_examples`.
    field_translations = {
        "chemin_audio": "path",
        "nature": "doctype",
        "forme": "sentence",
        "traduction:fr": "translation:fr",
        "traduction:en": "translation:en",
        "traduction:zh": "translation:zh",
    }

    # One configuration per language, selectable with e.g.
    #   datasets.load_dataset("pangloss.py", "yong1288")
    # No DEFAULT_CONFIG_NAME is set, so a configuration name must be given.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=language_name,
            version=_VERSION,
            description=language_data["description"],
        )
        for language_name, language_data in _LANGUAGES.items()
    ]

    def _info(self):
        """Build the `datasets.DatasetInfo` for the selected configuration.

        The feature set depends on the configuration: one
        ``translation:<code>`` string column is declared per translation
        language listed for the language in `_LANGUAGES`.
        """
        language = self.config.name
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.features.Audio(sampling_rate=16_000),
                "sentence": datasets.Value("string"),
                "doctype": datasets.Value("string"),
                **{
                    f"translation:{language_code}": datasets.Value("string")
                    for language_code in _LANGUAGES[language]["translations"]
                },
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            # `_CITATION` is a dict keyed by configuration name; pass only the
            # BibTeX string of the selected language, not the whole dict.
            citation=_CITATION[language],
            # The transcription lives in the "sentence" feature: the CSV
            # column "forme" is renamed by `field_translations`, so "forme"
            # is not a column of the generated examples.
            task_templates=[
                AutomaticSpeechRecognition(audio_column="audio", transcription_column="sentence")
            ],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the three splits.

        Args:
            dl_manager: the download manager supplied by the `datasets`
                library; its `download_and_extract` result is used as the
                root directory of the extracted archive.

        Returns:
            A list of `datasets.SplitGenerator` for the train, test and
            validation splits. Each one points at
            ``<data_dir>/<config name>/<split>.csv``; its `gen_kwargs` are
            forwarded to `_generate_examples`.
        """
        data_dir = dl_manager.download_and_extract(_LANGUAGES[self.config.name]["url"])
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, self.config.name, f"{split_label}.csv"),
                    "split": split_label,
                },
            )
            for split_name, split_label in splits
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` tuples read from one split's CSV file.

        Args:
            filepath: path of the CSV file describing the split.
            split: name of the split; unused here, received because it is
                part of the `gen_kwargs` built in `_split_generators`.

        Yields:
            ``(row index, example dict)``. The example keys are the CSV
            column names renamed through `field_translations`, plus
            ``"audio"``: the ``"path"`` value joined onto the directory that
            contains the CSV file.
        """
        base_dir = os.path.dirname(filepath)
        # newline="" is what the csv module requires so that newlines
        # embedded in quoted fields are parsed correctly.
        with open(filepath, encoding="utf-8", newline="") as file_descriptor:
            reader = csv.DictReader(file_descriptor)
            for key, row in enumerate(reader):
                # Rename each value through its own column name. Pairing a
                # filtered header list with all row values positionally would
                # shift values onto the wrong keys as soon as the CSV has a
                # column that is not in `field_translations`.
                data = {
                    self.field_translations[fieldname]: value
                    for fieldname, value in row.items()
                    if fieldname in self.field_translations
                }
                data["audio"] = os.path.join(base_dir, data["path"])
                yield key, data


if __name__ == "__main__":
    # for language in _LANGUAGES.keys():
    datasets.load_dataset("pangloss.py", "japh1234")
    # datasets-cli test datasets/pangloss --save_infos --all_configs
    # datasets-cli dummy_data datasets/pangloss --auto_generate