JustinLin610
update
8437114
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
import torch
from fairseq import utils
from fairseq.data import (
Dictionary,
TokenBlockDataset,
data_utils,
iterators,
)
from fairseq.dataclass import FairseqDataclass
from fairseq.distributed import utils as dist_utils
from fairseq.tasks import FairseqTask, register_task
from omegaconf import II
logger = logging.getLogger(__name__)
@dataclass
class TruncatedBPTTLMConfig(FairseqDataclass):
data: str = field(default="???", metadata={"help": "path to data directory"})
tokens_per_sample: int = field(
default=1024,
metadata={"help": "max number of tokens per sequence"},
)
batch_size: int = II("dataset.batch_size")
# Some models use *max_target_positions* to know how many positional
# embeddings to learn. We use II(...) to make it default to
# *tokens_per_sample*, but in principle there could be more positional
# embeddings than tokens in a single batch. This may also be irrelevant for
# custom model implementations.
max_target_positions: int = II("task.tokens_per_sample")
# these will be populated automatically if not provided
data_parallel_rank: Optional[int] = None
data_parallel_size: Optional[int] = None
@register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
class TruncatedBPTTLMTask(FairseqTask):
def __init__(self, cfg: TruncatedBPTTLMConfig):
super().__init__(cfg)
if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
if torch.distributed.is_initialized():
cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
else:
cfg.data_parallel_rank = 0
cfg.data_parallel_size = 1
# load the dictionary
paths = utils.split_paths(cfg.data)
assert len(paths) > 0
self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
logger.info("dictionary: {} types".format(len(self.dictionary)))
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
"""Load a given dataset split (e.g., train, valid, test)"""
# support sharded datasets
paths = utils.split_paths(self.cfg.data)
assert len(paths) > 0
data_path = paths[(epoch - 1) % len(paths)]
split_path = os.path.join(data_path, split)
# each element of *data* will be a tensorized line from the original
# text dataset, similar to ``open(split_path).readlines()``
data = data_utils.load_indexed_dataset(
split_path, self.dictionary, combine=combine
)
if data is None:
raise FileNotFoundError(
"Dataset not found: {} ({})".format(split, split_path)
)
# this is similar to ``data.view(-1).split(tokens_per_sample)``
data = TokenBlockDataset(
data,
data.sizes,
block_size=self.cfg.tokens_per_sample,
pad=None, # unused
eos=None, # unused
break_mode="none",
)
self.datasets[split] = TruncatedBPTTDataset(
data=data,
bsz_per_shard=self.cfg.batch_size,
shard_id=self.cfg.data_parallel_rank,
num_shards=self.cfg.data_parallel_size,
)
def dataset(self, split):
return self.datasets[split]
def get_batch_iterator(
self, dataset, num_workers=0, epoch=1, data_buffer_size=0, **kwargs
):
return iterators.EpochBatchIterator(
dataset=dataset,
collate_fn=self._collate_fn,
num_workers=num_workers,
epoch=epoch,
buffer_size=data_buffer_size,
# we don't use the batching functionality from EpochBatchIterator;
# instead every item in *dataset* is a whole batch
batch_sampler=[[i] for i in range(len(dataset))],
disable_shuffling=True,
)
def _collate_fn(self, items: List[List[torch.Tensor]]):
# we don't use fairseq's batching functionality, so we expect a single
# Tensor of type List[torch.Tensor]
assert len(items) == 1
# item will have shape B x T (the last batch may have length < T)
id, item = items[0]
item = data_utils.collate_tokens(item, pad_idx=self.source_dictionary.pad())
B, T = item.size()
# shift item one position over and append a padding token for the target
target = torch.nn.functional.pad(
item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
)
# fairseq expects batches to have the following structure
return {
"id": torch.tensor([id]*item.size(0)),
"net_input": {
"src_tokens": item,
},
"target": target,
"nsentences": item.size(0),
"ntokens": item.numel(),
}
def build_dataset_for_inference(
self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
) -> torch.utils.data.Dataset:
eos = self.source_dictionary.eos()
dataset = TokenBlockDataset(
src_tokens,
src_lengths,
block_size=None, # ignored for "eos" break mode
pad=self.source_dictionary.pad(),
eos=eos,
break_mode="eos",
)
class Dataset(torch.utils.data.Dataset):
def __getitem__(self, i):
item = dataset[i]
if item[-1] == eos:
# remove eos to support generating with a prefix
item = item[:-1]
return (i, [item])
def __len__(self):
return len(dataset)
return Dataset()
def inference_step(
self, generator, models, sample, prefix_tokens=None, constraints=None
):
with torch.no_grad():
if constraints is not None:
raise NotImplementedError
# SequenceGenerator doesn't use *src_tokens* directly, we need to
# pass the *prefix_tokens* argument instead.
if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
prefix_tokens = sample["net_input"]["src_tokens"]
# begin generation with the end-of-sentence token
bos_token = self.source_dictionary.eos()
return generator.generate(
models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
)
def eval_lm_dataloader(
self,
dataset,
max_tokens: Optional[int] = 36000,
batch_size: Optional[int] = None,
max_positions: Optional[int] = None,
num_shards: int = 1,
shard_id: int = 0,
num_workers: int = 1,
data_buffer_size: int = 10,
context_window: int = 0,
):
if context_window > 0:
raise NotImplementedError(
"Transformer-XL doesn't need --context-window, try "
"--model-overrides '{\"mem_len\":42}' instead "
)
return self.get_batch_iterator(
dataset=dataset,
max_tokens=max_tokens,
max_sentences=batch_size,
max_positions=max_positions,
ignore_invalid_inputs=True,
num_shards=num_shards,
shard_id=shard_id,
num_workers=num_workers,
data_buffer_size=data_buffer_size,
).next_epoch_itr(shuffle=False)
@property
def source_dictionary(self):
return self.dictionary
@property
def target_dictionary(self):
return self.dictionary
class TruncatedBPTTDataset(torch.utils.data.Dataset):
def __init__(
self,
data: List[torch.Tensor], # ordered list of items
bsz_per_shard, # number of items processed per GPUs per forward
shard_id, # current GPU ID
num_shards, # number of GPUs
):
super().__init__()
self.data = data
def batchify(data, bsz):
# Work out how cleanly we can divide the dataset into bsz parts.
nbatch = data.size(0) // bsz
# Trim off any extra elements that wouldn't cleanly fit (remainders).
data = data.narrow(0, 0, nbatch * bsz)
# Evenly divide the data across the bsz batches.
data = data.view(bsz, -1).contiguous()
return data
# total number of sequences processed by all GPUs in each forward pass
global_batch_size = bsz_per_shard * num_shards
"""
With a 16 item dataset, bsz_per_shard=2 and num_shards=3,
*indices* might look like:
indices = [[0, 1],
[2, 3],
[4, 5],
[6, 7],
[8, 9],
[10, 11]]
The size of the TruncatedBPTTDataset instance will be 2,
and shard 1 will see items:
[(0, [data[4], data[6]]),
(1, [data[5], data[7]])]
"""
indices = batchify(torch.arange(len(data)), global_batch_size)
assert indices.size(0) == global_batch_size
self.my_indices = indices[
shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard
]
assert self.my_indices.size(0) == bsz_per_shard
def __len__(self):
return self.my_indices.size(1)
def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
return (i, [self.data[idx] for idx in self.my_indices[:, i]])