File size: 5,264 Bytes

a244e91

import csv
import json
import os

import datasets
import pandas as pd
import numpy as np


# TODO: Add BibTeX citation
# The entry below is a placeholder; the real citation can usually be found on
# arXiv or on the dataset's repository/website.
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {A great new dataset},
author={huggingface, Inc.
},
year={2020}
}
"""

# TODO: Add a description of the dataset here
# The text below is a placeholder; an official description can be copied in.
_DESCRIPTION = """\
This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
"""

# TODO: Add a link to the official homepage of the dataset here
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# TODO: Add links to the official dataset URLs here
# The HuggingFace datasets library doesn't host the datasets; it only points to the original files.
# This can be an arbitrarily nested dict/list of URLs. It is currently empty.
_URLs = {
}


# TODO: The dataset class name usually matches the script name, in CamelCase instead of snake_case
class WITDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads image/caption records from local TSV files.

    Nothing is downloaded. The builder expects ``data_dir`` to be supplied
    through the builder config and to contain one sub-directory per split::

        <data_dir>/train/train.tsv
        <data_dir>/test/test.tsv
        <data_dir>/dev/dev.tsv

    Every example also carries two derived paths,
    ``<split dir>/images/<id>.jpg`` and ``<split dir>/numpy/<id>.npy``. The
    paths are only constructed here; nothing checks that the files exist.
    """

    VERSION = datasets.Version("1.1.0")

    DEFAULT_CONFIG_NAME = "en"

    # Zero-based positions of the TSV columns consumed by `_generate_examples`.
    _COL_ID = 0
    _COL_LANG = 1
    _COL_IMAGE_URL = 2
    _COL_PAGE_URL = 3
    _COL_CAPTION = 4
    _COL_CONTEXT = 5

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing columns and metadata."""
        features = datasets.Features(
            {
                "id": datasets.Value("int64"),
                "lang": datasets.Value("string"),
                "caption": datasets.Value("string"),
                "context": datasets.Value("string"),
                "image_url": datasets.Value("string"),
                "page_url": datasets.Value("string"),
                # Paths built from the example id; see `_generate_examples`.
                "image_file": datasets.Value("string"),
                "pixels_file": datasets.Value("string"),
            }
        )

        return datasets.DatasetInfo(
            # Description shown on the dataset's page.
            description=_DESCRIPTION,
            # Columns of the dataset and their types.
            features=features,
            # No canonical (input, target) pair is declared.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return one ``SplitGenerator`` each for train, test and validation.

        ``dl_manager`` is unused: the data is read from the local directory
        given by ``self.config.data_dir``.

        Raises:
            ValueError: If no ``data_dir`` was configured.
        """
        data_dir = self.config.data_dir
        if data_dir is None:
            # Without this guard os.path.join fails with an opaque TypeError.
            raise ValueError(
                "This dataset is read from local files: pass `data_dir` pointing "
                "at the directory that contains the `train`, `test` and `dev` "
                "sub-directories."
            )

        # (split identifier, name shared by the sub-directory and its TSV file)
        split_layout = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )

        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs are passed to `_generate_examples`.
                gen_kwargs={
                    "data_dir": os.path.join(data_dir, folder),
                    "split": folder,
                },
            )
            for split_name, folder in split_layout
        ]

    def _generate_examples(self, data_dir, split):
        """Yield ``(key, example)`` tuples for one split.

        Args:
            data_dir: Directory of the split; it must contain ``<split>.tsv``.
            split: Stem of the split's TSV file ("train", "test" or "dev").

        Yields:
            ``(key, example)`` where ``key`` is the row's index in the parsed
            TSV and ``example`` is a dict with the fields declared in
            ``_info``. Rows whose caption or context is not a string (for
            instance empty cells, which pandas parses as NaN) are skipped, so
            keys may have gaps.
        """
        df = pd.read_csv(os.path.join(data_dir, f"{split}.tsv"), sep="\t")

        # Plain tuples are indexed by position. This avoids integer keys on a
        # label-indexed Series (`row[0]`), whose positional fallback is
        # deprecated in pandas, and is much cheaper than building one Series
        # per row with `iterrows`.
        rows = df.itertuples(index=False, name=None)

        for key, fields in zip(df.index, rows):
            caption = fields[self._COL_CAPTION]
            context = fields[self._COL_CONTEXT]

            # Skip rows with a missing (non-string) caption or context.
            if not (isinstance(caption, str) and isinstance(context, str)):
                continue

            example_id = fields[self._COL_ID]

            yield key, {
                "id": example_id,
                "lang": fields[self._COL_LANG],
                "caption": caption,
                "context": context,
                "image_url": fields[self._COL_IMAGE_URL],
                "page_url": fields[self._COL_PAGE_URL],
                "image_file": os.path.join(data_dir, "images", f"{example_id}.jpg"),
                "pixels_file": os.path.join(data_dir, "numpy", f"{example_id}.npy"),
            }