from fairseq import utils
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.lightconv import Embedding, LightConvDecoder
from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
|
|
@register_model("lightconv_lm")
class LightConvLanguageModel(FairseqLanguageModel):
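    """Decoder-only language model built from LightConv/DynamicConv decoder
    layers ("Pay Less Attention with Lightweight and Dynamic Convolutions",
    Wu et al., 2019).
    """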
    def __init__(self, decoder):
        super().__init__(decoder)
|
    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--attention-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--input-dropout",
            type=float,
            metavar="D",
            help="dropout probability of the inputs",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-output-dim",
            type=int,
            metavar="N",
            help="decoder output dimension",
        )
        parser.add_argument(
            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )
        parser.add_argument(
            "--adaptive-softmax-factor",
            type=float,
            metavar="N",
            help="adaptive softmax factor",
        )
        parser.add_argument(
            "--no-token-positional-embeddings",
            default=False,
            action="store_true",
            help="if set, disables positional embeddings (outside self attention)",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            default=False,
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--character-embeddings",
            default=False,
            action="store_true",
            help="if set, uses character embedding convolutions to produce token embeddings",
        )
        parser.add_argument(
            "--character-filters",
            type=str,
            metavar="LIST",
            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
            help="list of (kernel width, num filters) pairs for the character CNN",
        )
        parser.add_argument(
            "--character-embedding-dim",
            type=int,
            metavar="N",
            default=4,
            help="size of character embeddings",
        )
        parser.add_argument(
            "--char-embedder-highway-layers",
            type=int,
            metavar="N",
            default=2,
            help="number of highway layers for character token embedder",
        )
        parser.add_argument(
            "--adaptive-input",
            default=False,
            action="store_true",
            help="if set, uses adaptive input",
        )
        parser.add_argument(
            "--adaptive-input-factor",
            type=float,
            metavar="N",
            help="adaptive input factor",
        )
        parser.add_argument(
            "--adaptive-input-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive input cutoff points.",
        )
        parser.add_argument(
            "--tie-adaptive-weights",
            action="store_true",
            help="if set, ties the weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--tie-adaptive-proj",
            action="store_true",
            help="if set, ties the projection weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )

        # LightConv and DynamicConv arguments
        parser.add_argument(
            "--decoder-kernel-size-list",
            type=lambda x: utils.eval_str_list(x, int),
            help='list of kernel sizes (default: "[3,7,15,31,31,31]")',
        )
        parser.add_argument(
            "--decoder-glu",
            type=utils.eval_bool,
            help="apply GLU after the input projection",
        )
        parser.add_argument(
            "--decoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument(
            "--weight-softmax",
            default=True,
            type=utils.eval_bool,
            help="normalize the convolution weights with softmax",
        )
        parser.add_argument(
            "--weight-dropout",
            type=float,
            metavar="D",
            help="dropout probability for conv weights",
        )
|
    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
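
        # make sure all arguments are present in older models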
        base_lm_architecture(args)

        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample
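
        # Select the token embedder: a character CNN, adaptive input
        # embeddings, or a plain embedding lookup table.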
        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(
                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
            )
|
        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (
                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
            ), "{} != {}".format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
            )
            assert args.decoder_input_dim == args.decoder_output_dim
|
        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)
|
|
@register_model_architecture("lightconv_lm", "lightconv_lm")
def base_lm_architecture(args):
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)

    args.character_embeddings = getattr(args, "character_embeddings", False)

    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)
|
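    # always normalize before each decoder block; this intentionally
    # overrides whatever --decoder-normalize-before parsed to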
    args.decoder_normalize_before = True

    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)

    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)
|
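    # per-layer convolution kernel sizes; a single entry is broadcast
    # to all decoder layers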
    args.decoder_kernel_size_list = getattr(
        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
    )
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = (
            args.decoder_kernel_size_list * args.decoder_layers
        )
    assert (
        len(args.decoder_kernel_size_list) == args.decoder_layers
    ), "decoder_kernel_size_list doesn't match decoder_layers"
    args.decoder_glu = getattr(args, "decoder_glu", True)
    args.input_dropout = getattr(args, "input_dropout", 0.1)
    args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)
|
|
@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
def lightconv_lm_gbw(args):
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
    base_lm_architecture(args)
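

# Example invocation (a sketch; the data path and hyperparameters are
# illustrative, not prescribed by this file):
#
#   fairseq-train data-bin/wikitext-103 \
#       --task language_modeling \
#       --arch lightconv_lm \
#       --decoder-conv-type dynamic \
#       --max-tokens 2048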
|
|