# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pangloss datasets for Yongning Na (yong1288) and Japhug (japh1234)"""

import csv
import json
import os
import datasets
from datasets.tasks import AutomaticSpeechRecognition

# BibTeX entries for the published corpora, keyed by the same language
# identifiers that key `_LANGUAGES` (and therefore name the builder configs).
# NOTE(review): the two literals are not formatted alike -- the "yong1288"
# entry starts with a newline and ends with an indented line, while the
# "japh1234" entry uses a line continuation to avoid the leading newline.
_CITATION = {
    "yong1288": """
@misc{michaud_alexis_2021_5336698,
author       = {Michaud, Alexis and
                Galliot, Benjamin and
                Guillaume, Séverine},
title        = {{Yongning Na for Natural Language Processing: a
                single-speaker audio corpus with transcriptions}},
month        = aug,
year         = 2021,
publisher    = {Zenodo},
version      = {1.0},
doi          = {10.5281/zenodo.5336698},
url          = {https://doi.org/10.5281/zenodo.5336698}
    }
    """,
    "japh1234": """\
@misc{jacques_guillaume_2021_5521112,
author       = {Jacques, Guillaume and
                Galliot, Benjamin and
                Guillaume, Séverine},
title        = {{Japhug for Natural Language Processing: a single-
                speaker audio corpus with transcriptions}},
month        = sep,
year         = 2021,
publisher    = {Zenodo},
version      = {1.0},
doi          = {10.5281/zenodo.5521112},
url          = {https://doi.org/10.5281/zenodo.5521112}
    }
"""
}

# Summary text reported in the dataset info for every configuration.
_DESCRIPTION = (
    "These datasets are extracts from the Pangloss collection and have\n"
    "been preprocessed for ASR experiments in Na and Japhug.\n"
)

# Homepage reported in the dataset info.
_HOMEPAGE = "https://pangloss.cnrs.fr/"

# URL of the licence text reported in the dataset info.
_LICENSE = "https://creativecommons.org/licenses/by-nc-sa/4.0/fr/legalcode"

# The HuggingFace Datasets library doesn't host the data itself; it only points to the original files.
# The download URL of each language archive is stored under the "url" key of `_LANGUAGES` below
# and is fetched in the `_split_generators` method.

# Version attached to every builder configuration.
_VERSION = datasets.Version("1.0.0")

# Per-language settings, keyed by the identifier that names the builder configuration.
# Each entry holds the archive download URL, the record homepage, a short
# description and the list of translation language codes.
_LANGUAGES = dict(
    yong1288=dict(
        url="https://mycore.core-cloud.net/index.php/s/vaGMeRf4Iij8MWR/download",
        homepage="https://zenodo.org/record/5336698",
        description="Yongning Na dataset",
        translations=["fr", "en", "zh"],
    ),
    japh1234=dict(
        url="https://mycore.core-cloud.net/index.php/s/kuQCxmyVcUFWroV/download",
        homepage="https://zenodo.org/record/5521112",
        description="Japhug dataset",
        translations=["fr", "zh"],
    ),
)

# TODO: The dataset class name usually matches the script name, in CamelCase instead of snake_case.
class PanglossDataset(datasets.GeneratorBasedBuilder):
    """Builder for the Pangloss ASR extracts, with one configuration per language.

    Each configuration is named after a key of ``_LANGUAGES``. Its archive is
    downloaded and extracted, and the ``train.csv``, ``test.csv`` and
    ``validation.csv`` files found in the ``<config name>`` sub-directory are
    turned into the corresponding splits.
    """

    # Maps the column headers used in the CSV files to the feature names
    # exposed by this dataset. Columns absent from this mapping are ignored.
    field_translations = {
        "chemin_audio": "path",
        "nature": "doctype",
        "forme": "sentence",
        "traduction:fr": "translation:fr",
        "traduction:en": "translation:en",
        "traduction:zh": "translation:zh",
    }

    # One configuration per language; select it with e.g.
    # datasets.load_dataset("pangloss.py", "japh1234").
    # No DEFAULT_CONFIG_NAME is set.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=language_name, version=_VERSION, description=language_data["description"])
        for language_name, language_data in _LANGUAGES.items()
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the selected configuration.

        The feature set depends on the configuration: one ``translation:<code>``
        string column is declared for each code listed under the language's
        ``"translations"`` entry in ``_LANGUAGES``.
        """
        language_code = self.config.name
        translation_features = {
            f"translation:{translation_code}": datasets.Value("string")
            for translation_code in _LANGUAGES[language_code]["translations"]
        }
        features = datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.features.Audio(sampling_rate=16_000),
                "sentence": datasets.Value("string"),
                "doctype": datasets.Value("string"),
                **translation_features,
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            # `_CITATION` is a dict keyed by language; only the entry for the
            # active configuration is a citation string.
            citation=_CITATION[language_code],
            # The transcription lives in the "sentence" feature: the CSV column
            # "forme" is renamed by `field_translations` before being yielded.
            task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="sentence")],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the language archive and declare the three splits.

        Args:
            dl_manager: download manager supplied by the library; its
                ``download_and_extract`` result is used as the extraction directory.

        Returns:
            A list of ``datasets.SplitGenerator`` objects (train, test, validation)
            whose ``gen_kwargs`` are forwarded to ``_generate_examples``.
        """
        archive_url = _LANGUAGES[self.config.name]["url"]
        data_dir = dl_manager.download_and_extract(archive_url)
        # The CSV files sit in a sub-directory named after the configuration.
        split_dir = os.path.join(data_dir, self.config.name)
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(split_dir, f"{file_stem}.csv"),
                    "split": file_stem,
                },
            )
            for split_name, file_stem in splits
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs read from one split's CSV file.

        Args:
            filepath: path of the CSV file describing the split.
            split: name of the split; accepted because it is part of
                ``gen_kwargs`` but not used here.

        Yields:
            Tuples of the zero-based row index and a dict holding the renamed
            CSV columns plus an ``"audio"`` entry, which is the row's ``"path"``
            value joined onto the directory containing the CSV file.

        Raises:
            KeyError: if a row has no ``chemin_audio`` column, since the audio
                path cannot be built.
        """
        base_dir = os.path.dirname(filepath)
        with open(filepath, encoding="utf-8") as file_descriptor:
            reader = csv.DictReader(file_descriptor)
            for key, row in enumerate(reader):
                # Rename column by column so each value stays attached to its
                # own header even when the file has columns that are not in
                # `field_translations`.
                data = {
                    self.field_translations[fieldname]: value
                    for fieldname, value in row.items()
                    if fieldname in self.field_translations
                }
                data["audio"] = os.path.join(base_dir, data["path"])
                yield key, data


if __name__ == "__main__":
    # Manual run: build only the "japh1234" configuration from this script.
    # Iterate over `_LANGUAGES` instead to build every configuration.
    datasets.load_dataset(path="pangloss.py", name="japh1234")

# datasets-cli test datasets/pangloss --save_infos --all_configs
# datasets-cli dummy_data datasets/pangloss --auto_generate