import csv | |
import json | |
import os | |
import datasets | |
import pandas as pd | |
import numpy as np | |
# TODO: Add BibTeX citation | |
# Find for instance the citation on arxiv or on the dataset repo/website | |
_CITATION = """\ | |
@InProceedings{huggingface:dataset, | |
title = {A great new dataset}, | |
author={huggingface, Inc. | |
}, | |
year={2020} | |
} | |
""" | |
# TODO: Add description of the dataset here | |
# You can copy an official description | |
_DESCRIPTION = """\ | |
This new dataset is designed to solve this great NLP task and is crafted with a lot of care. | |
""" | |
# TODO: Add a link to an official homepage for the dataset here | |
_HOMEPAGE = "" | |
# TODO: Add the licence for the dataset here if you can find it | |
_LICENSE = "" | |
# TODO: Add link to the official dataset URLs here | |
# The HuggingFace dataset library don't host the datasets but only point to the original files | |
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) | |
_URLs = { | |
} | |
# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case | |
class WITDataset(datasets.GeneratorBasedBuilder): | |
"""TODO: Short description of my dataset.""" | |
VERSION = datasets.Version("1.1.0") | |
DEFAULT_CONFIG_NAME = "en" | |
def _info(self): | |
# TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset | |
features = datasets.Features( | |
{ | |
"id": datasets.Value("int64"), | |
"lang": datasets.Value("string"), | |
"caption": datasets.Value("string"), | |
"context": datasets.Value("string"), | |
"image_url": datasets.Value("string"), | |
"page_url": datasets.Value("string"), | |
"image_file": datasets.Value("string"), | |
"pixels_file": datasets.Value("string") | |
# These are the features of your dataset like images, labels ... | |
} | |
) | |
return datasets.DatasetInfo( | |
# This is the description that will appear on the datasets page. | |
description=_DESCRIPTION, | |
# This defines the different columns of the dataset and their types | |
features=features, # Here we define them above because they are different between the two configurations | |
# If there's a common (input, target) tuple from the features, | |
# specify them here. They'll be used if as_supervised=True in | |
# builder.as_dataset. | |
supervised_keys=None, | |
# Homepage of the dataset for documentation | |
homepage=_HOMEPAGE, | |
# License for the dataset if available | |
license=_LICENSE, | |
# Citation for the dataset | |
citation=_CITATION, | |
) | |
def _split_generators(self, dl_manager): | |
"""Returns SplitGenerators.""" | |
# TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration | |
# If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name | |
data_dir = self.config.data_dir | |
return [ | |
datasets.SplitGenerator( | |
name=datasets.Split.TRAIN, | |
# These kwargs will be passed to _generate_examples | |
gen_kwargs={ | |
"data_dir": os.path.join(data_dir, "train"), | |
"split": "train", | |
}, | |
), | |
datasets.SplitGenerator( | |
name=datasets.Split.TEST, | |
# These kwargs will be passed to _generate_examples | |
gen_kwargs={ | |
"data_dir": os.path.join(data_dir, "test"), | |
"split": "test" | |
}, | |
), | |
datasets.SplitGenerator( | |
name=datasets.Split.VALIDATION, | |
# These kwargs will be passed to _generate_examples | |
gen_kwargs={ | |
"data_dir": os.path.join(data_dir, "dev"), | |
"split": "dev", | |
}, | |
), | |
] | |
def _generate_examples( | |
self, data_dir, split # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` | |
): | |
""" Yields examples as (key, example) tuples. """ | |
# This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. | |
# The `key` is here for legacy reason (tfds) and is not important in itself. | |
df = pd.read_csv(os.path.join(data_dir, f'{split}.tsv'), sep='\t') | |
for id_, row in df.iterrows(): | |
_id = row[0] | |
# null caption and context | |
if type(row[4]) != str or type(row[5]) != str: | |
continue | |
image_file = os.path.join(data_dir, 'images', f'{_id}.jpg') | |
pixels_file = os.path.join(data_dir, 'numpy', f'{_id}.npy') | |
yield id_, { | |
"id": row[0], | |
"lang": row[1], | |
"caption": row[4], | |
"context": row[5], | |
"image_url": row[2], | |
"page_url": row[3], | |
"image_file": image_file, | |
"pixels_file": pixels_file | |
} | |