import csv | |
import json | |
import os | |
import datasets | |
import pandas as pd | |
import numpy as np | |
# TODO: Add BibTeX citation | |
# Find for instance the citation on arxiv or on the dataset repo/website | |
_CITATION = """\ | |
@InProceedings{huggingface:dataset, | |
title = {A great new dataset}, | |
author={huggingface, Inc. | |
}, | |
year={2020} | |
} | |
""" | |
# TODO: Add description of the dataset here | |
# You can copy an official description | |
_DESCRIPTION = """\ | |
This new dataset is designed to solve this great NLP task and is crafted with a lot of care. | |
""" | |
# TODO: Add a link to an official homepage for the dataset here | |
_HOMEPAGE = "" | |
# TODO: Add the licence for the dataset here if you can find it | |
_LICENSE = "" | |
# TODO: Add link to the official dataset URLs here | |
# The HuggingFace dataset library don't host the datasets but only point to the original files | |
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) | |
_URLs = { | |
} | |
# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case | |
class COCODataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for COCO 2014 captions stored as local JSON files.

    Each example carries a caption under ``en``, a second text field under
    ``fr`` (presumably its French translation -- confirm against the data),
    the caption and image ids, and the path of the matching COCO image.

    Nothing is downloaded: all files are read from ``self.config.data_dir``,
    which is expected to be laid out as::

        <data_dir>/<split>.json
        <data_dir>/<split>2014/COCO_<split>2014_<image_id, 12 digits>.jpg

    with ``<split>`` being ``train``, ``val`` or ``test``.
    """

    VERSION = datasets.Version("1.1.0")
    DEFAULT_CONFIG_NAME = "en"

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing columns and metadata."""
        features = datasets.Features(
            {
                "id": datasets.Value("int64"),
                "en": datasets.Value("string"),
                "fr": datasets.Value("string"),
                "image_id": datasets.Value("int64"),
                # Path to the image on disk, not the decoded image.
                "image_file": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No canonical (input, target) pair is declared for this dataset.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the train, test and validation ``SplitGenerator`` objects.

        ``dl_manager`` is unused: the data is read directly from
        ``self.config.data_dir``. Each generator's ``gen_kwargs`` are passed
        as keyword arguments to ``_generate_examples``.
        """
        data_dir = self.config.data_dir
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": data_dir,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": data_dir,
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": data_dir,
                    # The validation files on disk are named "val".
                    "split": "val",
                },
            ),
        ]

    def _generate_examples(self, data_dir, split):
        """Yield ``(key, example)`` tuples for one split.

        Args:
            data_dir: Directory holding ``<split>.json`` and the
                ``<split>2014`` image folder.
            split: File-name stem of the split (``"train"``, ``"val"`` or
                ``"test"``). ``"dev"`` is accepted as an alias for ``"val"``.

        Yields:
            ``(index, example)`` where ``index`` is the record's position in
            the JSON file and ``example`` has the keys ``id``, ``en``, ``fr``,
            ``image_id`` and ``image_file``.

        Raises:
            FileNotFoundError: If ``<data_dir>/<split>.json`` does not exist.
            KeyError: If a record lacks ``id``, ``caption``, ``fr`` or
                ``image_id``.
        """
        # The files on disk use "val"; map the "dev" alias onto it.
        if split == "dev":
            split = "val"

        annotation_path = os.path.join(data_dir, f"{split}.json")
        image_dir = os.path.join(data_dir, f"{split}2014")

        # Decode explicitly as UTF-8 so captions with non-ASCII characters do
        # not depend on the platform's default locale encoding.
        with open(annotation_path, encoding="utf-8") as fp:
            examples = json.load(fp)

        for id_, ex in enumerate(examples):
            image_id = ex["image_id"]
            # COCO image names embed the image id zero-padded to 12 digits.
            fn = f"COCO_{split}2014_{str(image_id).zfill(12)}.jpg"
            yield id_, {
                "id": ex["id"],
                "en": ex["caption"],
                "fr": ex["fr"],
                "image_id": image_id,
                "image_file": os.path.join(image_dir, fn),
            }