|
import csv |
|
import json |
|
import os |
|
|
|
import datasets |
|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
class ImageCaptionBuilderConfig(datasets.BuilderConfig):
    """Builder configuration describing one image-caption corpus.

    Besides the standard ``datasets.BuilderConfig`` fields, it records which
    splits and caption languages the corpus offers and the two options the
    dataset builder uses to derive image file names from image ids.
    """

    def __init__(self, name, splits, langs, prefix_before_image_fn=False, zfill=1, **kwargs):
        """Create the configuration.

        Args:
            name: Configuration name, forwarded to ``datasets.BuilderConfig``.
            splits: Split names available for this corpus (stored as-is).
            langs: Keys of the caption languages present in each record
                (stored as-is).
            prefix_before_image_fn: When true, the builder prepends
                ``"<name>_<split>_"`` to every image file name.
            zfill: Width to which the image id is zero-padded when the image
                file name is built.
            **kwargs: Forwarded unchanged to ``datasets.BuilderConfig``.
        """
        super().__init__(name=name, **kwargs)

        # What the corpus contains.
        self.splits, self.langs = splits, langs

        # How image file names are derived from image ids.
        self.prefix_before_image_fn, self.zfill = prefix_before_image_fn, zfill
|
|
|
|
|
|
|
|
|
_CITATION = """\ |
|
@InProceedings{None, |
|
title = {Generic images to captions dataset}, |
|
author={Yih-Dar SHIEH}, |
|
year={2020} |
|
} |
|
""" |
|
|
|
|
|
|
|
_DESCRIPTION = """\ |
|
|
|
""" |
|
|
|
|
|
_HOMEPAGE = "" |
|
|
|
|
|
_LICENSE = "" |
|
|
|
|
|
|
|
|
|
_URLs = {} |
|
|
|
|
|
|
|
class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
    """Image-caption dataset builder reading local JSONL annotations and JPEG images.

    Expected layout under ``config.data_dir`` (``<name>`` is the config name,
    ``<split>`` is ``train``, ``valid`` or ``test``)::

        <name>_jsonls/<split>/   files whose name ends with ``jsonl``,
                                 one JSON object per line
        <name>_images/<split>/   one ``.jpg`` file per image id

    Every JSON object must provide ``image_id``, ``id``, ``caption`` and one
    entry per language listed in ``config.langs``; ``image_url`` is optional.
    Records whose image file does not exist on disk are skipped.
    """

    VERSION = datasets.Version("0.0.0")

    BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
    BUILDER_CONFIGS = [
        ImageCaptionBuilderConfig(name='coco_2017', splits=['train', 'valid'], prefix_before_image_fn=False, zfill=12, langs=['en', 'fr']),
        ImageCaptionBuilderConfig(name='cc3m', splits=['train', 'valid'], prefix_before_image_fn=True, zfill=8, langs=['en', 'fr']),
        ImageCaptionBuilderConfig(name='cc12m', splits=['train', 'valid'], prefix_before_image_fn=True, zfill=8, langs=['en', 'fr']),
    ]
    DEFAULT_CONFIG_NAME = "coco_2017"

    # Spellings accepted in ``config.splits`` -> (``datasets`` split produced,
    # name used for the on-disk sub-directory and the image file-name prefix).
    _SPLIT_ALIASES = {
        'train': (datasets.Split.TRAIN, 'train'),
        'val': (datasets.Split.VALIDATION, 'valid'),
        'valid': (datasets.Split.VALIDATION, 'valid'),
        'validation': (datasets.Split.VALIDATION, 'valid'),
        'dev': (datasets.Split.VALIDATION, 'valid'),
        'test': (datasets.Split.TEST, 'test'),
    }

    def _info(self):
        """Return the ``DatasetInfo``: fixed columns plus one string column per language."""
        feature_dict = {
            "image_id": datasets.Value("int64"),
            "id": datasets.Value("int64"),
            "caption": datasets.Value("string"),
        }
        # One caption column per configured language, keyed by the language name.
        for lang in self.config.langs:
            feature_dict[lang] = datasets.Value("string")
        feature_dict["image_url"] = datasets.Value("string")
        feature_dict["image_file"] = datasets.Value("string")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(feature_dict),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return one ``SplitGenerator`` per recognised entry of ``config.splits``.

        Nothing is downloaded (``dl_manager`` is unused); all paths are built
        from ``config.data_dir``. Split names that are not in
        ``_SPLIT_ALIASES`` are skipped silently.

        Raises:
            ValueError: If a recognised split is requested but
                ``config.data_dir`` is ``None``.
        """
        data_dir = self.config.data_dir
        jsonl_root = f'{self.config.name}_jsonls'
        image_root = f'{self.config.name}_images'

        generators = []
        for requested in self.config.splits:
            resolved = self._SPLIT_ALIASES.get(requested)
            if resolved is None:
                # Unknown split names are ignored rather than treated as errors.
                continue
            if data_dir is None:
                # Fail with an actionable message instead of the TypeError that
                # os.path.join(None, ...) would raise.
                raise ValueError(
                    f"Config '{self.config.name}' requires `data_dir` pointing to the "
                    f"folder that contains '{jsonl_root}' and '{image_root}'."
                )

            split_name, subdir = resolved
            generators.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "jsonl_dir": os.path.join(data_dir, jsonl_root, subdir),
                        "image_dir": os.path.join(data_dir, image_root, subdir),
                        "split": subdir,
                    },
                )
            )

        return generators

    def _generate_examples(self, jsonl_dir, image_dir, split):
        """Yield ``(key, example)`` tuples for one split.

        Args:
            jsonl_dir: Directory holding the annotation files; every regular
                file whose name ends with ``jsonl`` is read.
            image_dir: Directory holding the images of this split.
            split: Split tag used in the image file-name prefix when
                ``config.prefix_before_image_fn`` is set; ``'dev'`` is
                treated as ``'valid'``.

        Yields:
            ``(key, example)`` where ``key`` is an integer counting the
            examples yielded so far and ``example`` holds ``image_id``,
            ``id``, ``caption``, one entry per configured language,
            ``image_url`` (empty string when absent) and ``image_file``.
        """
        if split == 'dev':
            split = 'valid'

        # os.listdir returns entries in arbitrary order; sort so that the
        # example order is reproducible from run to run.
        jsonl_files = sorted(
            os.path.join(jsonl_dir, fn)
            for fn in os.listdir(jsonl_dir)
            if fn.endswith("jsonl") and os.path.isfile(os.path.join(jsonl_dir, fn))
        )

        # Loop-invariant parts of the image file name.
        prefix = f'{self.config.name}_{split}_' if self.config.prefix_before_image_fn else ''
        width = self.config.zfill
        langs = self.config.langs

        # Keys must be unique across the whole split. A per-file line index
        # repeats as soon as there is more than one JSONL file, so a single
        # counter is shared by all files.
        key = 0
        for jsonl_file in jsonl_files:
            with open(jsonl_file, 'r', encoding='UTF-8') as fp:
                for line in fp:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue

                    ex = json.loads(line)

                    example = {
                        "image_id": ex['image_id'],
                        "id": ex["id"],
                        "caption": ex["caption"],
                    }
                    for lang in langs:
                        example[lang] = ex[lang]
                    example['image_url'] = ex.get('image_url', '')

                    fn = prefix + f'{str(ex["image_id"]).zfill(width)}.jpg'
                    image_file = os.path.join(image_dir, fn)
                    example['image_file'] = image_file

                    # Skip records whose image is not available locally.
                    if not os.path.isfile(image_file):
                        continue

                    yield key, example
                    key += 1
|
|