t5mimo-seq2seq / modeling_t5mimo.py

Upload T5MIMOForConditionalGeneration

b679034 verified 3 months ago

76.4 kB

	import copy
	import math
	import warnings
	from typing import Optional, Tuple, Union
	import torch
	from torch import nn
	from torch.nn import CrossEntropyLoss
	from transformers.activations import ACT2FN
	from transformers.modeling_outputs import (
	BaseModelOutput,
	BaseModelOutputWithPastAndCrossAttentions,
	Seq2SeqLMOutput,
	Seq2SeqModelOutput,
	)
	from transformers.modeling_utils import PreTrainedModel
	from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
	from transformers.utils import (
	DUMMY_INPUTS,
	DUMMY_MASK,
	is_torch_fx_proxy,
	logging,
	)
	from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
	from .configuration_t5mimo import T5MIMOConfig


	# Module-level logger, obtained from the `logging` helper imported from `transformers.utils`
	# and named after this module.
	logger = logging.get_logger(__name__)



	class T5LayerNorm(nn.Module):
	    """
	    T5-style layer normalisation: a learned per-feature scale applied to the input divided by its
	    root-mean-square over the last axis. There is no bias and the mean is not subtracted
	    (Root Mean Square Layer Normalization, https://arxiv.org/abs/1910.07467).
	    """

	    def __init__(self, hidden_size, eps=1e-6):
	        """
	        Args:
	            hidden_size: size of the last (feature) axis; one scale weight is learned per feature.
	            eps: constant added to the mean of squares before the reciprocal square root.
	        """
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(hidden_size))
	        self.variance_epsilon = eps

	    def forward(self, hidden_states):
	        """Scale `hidden_states` by the inverse RMS of its last axis, then by the learned weight."""
	        # The mean of squares is accumulated in fp32 so half-precision inputs do not lose accuracy.
	        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
	        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
	        normed = hidden_states * inverse_rms

	        # The product above is fp32; cast back down only when the weight itself is half precision.
	        if self.weight.dtype in (torch.float16, torch.bfloat16):
	            normed = normed.to(self.weight.dtype)

	        return self.weight * normed


	# Register the custom norm in the `ALL_LAYERNORM_LAYERS` list imported from `transformers.pytorch_utils`.
	# NOTE(review): presumably this lets library utilities recognise it as a layer-norm type — confirm.
	ALL_LAYERNORM_LAYERS.append(T5LayerNorm)


	class T5DenseActDense(nn.Module):
	    """Bias-free feed-forward sub-module: `wi` projection -> activation -> dropout -> `wo` projection."""

	    def __init__(self, config: T5MIMOConfig):
	        """Read `d_model`, `d_ff`, `dropout_rate` and `dense_act_fn` from `config`."""
	        super().__init__()
	        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
	        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
	        self.dropout = nn.Dropout(config.dropout_rate)
	        self.act = ACT2FN[config.dense_act_fn]

	    def forward(self, hidden_states):
	        """Apply the feed-forward transform to `hidden_states` and return the result."""
	        activated = self.dropout(self.act(self.wi(hidden_states)))

	        # `wo` may live in a different dtype than the activations; align them before the
	        # final projection, except when the weight is int8.
	        out_weight = self.wo.weight
	        needs_cast = (
	            isinstance(out_weight, torch.Tensor)
	            and activated.dtype != out_weight.dtype
	            and out_weight.dtype != torch.int8
	        )
	        if needs_cast:
	            activated = activated.to(out_weight.dtype)

	        return self.wo(activated)


	class T5DenseGatedActDense(nn.Module):
	    """
	    Gated bias-free feed-forward sub-module: the activated `wi_0` projection is multiplied
	    element-wise with the linear `wi_1` projection, then dropout and the `wo` projection follow.
	    """

	    def __init__(self, config: T5MIMOConfig):
	        """Read `d_model`, `d_ff`, `dropout_rate` and `dense_act_fn` from `config`."""
	        super().__init__()
	        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
	        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
	        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
	        self.dropout = nn.Dropout(config.dropout_rate)
	        self.act = ACT2FN[config.dense_act_fn]

	    def forward(self, hidden_states):
	        """Apply the gated feed-forward transform to `hidden_states` and return the result."""
	        gate = self.act(self.wi_0(hidden_states))
	        linear_branch = self.wi_1(hidden_states)
	        gated = self.dropout(gate * linear_branch)

	        # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
	        # See https://github.com/huggingface/transformers/issues/20287
	        # The int8 check covers users who force `_keep_in_fp32_modules` to be `None`.
	        out_weight = self.wo.weight
	        needs_cast = (
	            isinstance(out_weight, torch.Tensor)
	            and gated.dtype != out_weight.dtype
	            and out_weight.dtype != torch.int8
	        )
	        if needs_cast:
	            gated = gated.to(out_weight.dtype)

	        return self.wo(gated)


	class T5LayerFF(nn.Module):
	    """Pre-norm residual feed-forward layer: `x + dropout(FF(layer_norm(x)))`."""

	    def __init__(self, config: T5MIMOConfig):
	        """Pick the gated or plain feed-forward variant according to `config.is_gated_act`."""
	        super().__init__()
	        feed_forward_cls = T5DenseGatedActDense if config.is_gated_act else T5DenseActDense
	        self.DenseReluDense = feed_forward_cls(config)

	        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	    def forward(self, hidden_states):
	        """Return `hidden_states` plus the dropped-out feed-forward output of its normalised copy."""
	        residual = hidden_states
	        transformed = self.DenseReluDense(self.layer_norm(residual))
	        return residual + self.dropout(transformed)



	class MultivariateConvBlock(nn.Module):
	    """
	    Three-layer 2-D convolutional block that treats the `num_seqs` axis as convolution channels.

	    Layer order: `conv1` (num_seqs -> num_filters, square kernel) -> batch norm -> ReLU ->
	    `conv2` (num_filters -> num_filters, `(kernel_size, 1)` kernel) -> batch norm -> ReLU ->
	    `conv3` (1x1, num_filters -> num_seqs).

	    Args:
	        config: provides `num_seqs` (input/output channels) and `num_filters` (hidden channels).
	        kernel_size: side of the square `conv1` kernel and length of the `conv2` kernel.
	        stride: stride of `conv1` (both spatial axes) and of `conv2` (first spatial axis only).
	        padding: zero padding of `conv1` (both spatial axes) and of `conv2` (first spatial axis only).

	    With the defaults (`kernel_size=3, stride=1, padding=1`) both spatial sizes are preserved.
	    Other combinations preserve the shape only when `stride == 1` and `padding == kernel_size // 2`.
	    """

	    def __init__(self, config: T5MIMOConfig, kernel_size=3, stride=1, padding=1):
	        super().__init__()
	        # First convolution mixes the sequences (channels) with a square kernel.
	        # `stride` and `padding` are now forwarded; they used to be hard-coded to 1 here, so the
	        # constructor arguments were silently ignored by this layer while `conv2` honoured them.
	        self.conv1 = nn.Conv2d(
	            in_channels=config.num_seqs,
	            out_channels=config.num_filters,
	            kernel_size=kernel_size,
	            stride=stride,
	            padding=padding,
	        )

	        # Batch normalization for stabilization and faster convergence
	        self.bn1 = nn.BatchNorm2d(config.num_filters)

	        # Second convolution: the kernel spans only the first spatial axis.
	        # NOTE(review): `forward` permutes the input to [batch, num_seqs, model_dim, seq_len], so this
	        # axis is model_dim, not time, if the documented input layout is right — confirm the intent.
	        self.conv2 = nn.Conv2d(
	            in_channels=config.num_filters,
	            out_channels=config.num_filters,
	            kernel_size=(kernel_size, 1),
	            stride=(stride, 1),
	            padding=(padding, 0),
	        )

	        # Batch normalization after second convolution
	        self.bn2 = nn.BatchNorm2d(config.num_filters)

	        # 1x1 convolution to reduce the channel dimension back to num_seqs
	        self.conv3 = nn.Conv2d(
	            in_channels=config.num_filters,
	            out_channels=config.num_seqs,
	            kernel_size=(1, 1),
	        )

	    def forward(self, x):
	        """
	        Forward pass of the multivariate convolutional block.

	        Args:
	            x (torch.Tensor): Input tensor of shape [batch_size, num_seqs, seq_len, model_dim].

	        Returns:
	            torch.Tensor: Output tensor with `num_seqs` channels; same shape as `x` when the
	            convolution settings preserve the spatial sizes (true for the defaults).
	        """
	        # [batch_size, num_seqs, seq_len, model_dim] -> [batch_size, num_seqs, model_dim, seq_len]
	        x = x.permute(0, 1, 3, 2)

	        # Two conv -> batch norm -> ReLU stages
	        x = nn.functional.relu(self.bn1(self.conv1(x)))
	        x = nn.functional.relu(self.bn2(self.conv2(x)))

	        # Reduce channel dimension back to num_seqs
	        x = self.conv3(x)

	        # Undo the permutation so the last two axes are (seq_len, model_dim) again
	        return x.permute(0, 1, 3, 2)



	class T5Attention(nn.Module):
	    """
	    Multi-head attention with optional T5 relative position bias.

	    `forward` accepts either `(batch_size, seq_length, d_model)` inputs or multivariate
	    `(batch_size, multivar_dim, seq_length, d_model)` inputs. In the multivariate case every per-head
	    tensor carries an extra `multivar_dim` axis right after the batch axis, e.g. keys are
	    `(batch_size, multivar_dim, n_heads, key_length, dim_per_head)`. Head, time and feature axes are
	    therefore always addressed from the end (`-3`, `-2`, `-1`) so both layouts share one code path.
	    """

	    def __init__(self, config: T5MIMOConfig, has_relative_attention_bias=False):
	        """
	        Args:
	            config: provides `is_decoder`, `relative_attention_num_buckets`,
	                `relative_attention_max_distance`, `d_model`, `d_kv`, `num_heads` and `dropout_rate`.
	            has_relative_attention_bias: when True this module owns the relative position bias embedding.
	        """
	        super().__init__()
	        self.is_decoder = config.is_decoder
	        self.has_relative_attention_bias = has_relative_attention_bias
	        self.relative_attention_num_buckets = config.relative_attention_num_buckets
	        self.relative_attention_max_distance = config.relative_attention_max_distance
	        self.d_model = config.d_model
	        self.key_value_proj_dim = config.d_kv
	        self.n_heads = config.num_heads
	        self.dropout = config.dropout_rate
	        self.inner_dim = self.n_heads * self.key_value_proj_dim

	        # Mesh TensorFlow initialization to avoid scaling before softmax
	        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
	        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
	        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
	        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

	        if self.has_relative_attention_bias:
	            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
	        self.pruned_heads = set()
	        self.gradient_checkpointing = False

	    def prune_heads(self, heads):
	        """Remove the given attention heads from the q/k/v/o projections and update the head bookkeeping."""
	        if len(heads) == 0:
	            return
	        heads, index = find_pruneable_heads_and_indices(
	            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
	        )
	        # Prune linear layers
	        self.q = prune_linear_layer(self.q, index)
	        self.k = prune_linear_layer(self.k, index)
	        self.v = prune_linear_layer(self.v, index)
	        self.o = prune_linear_layer(self.o, index, dim=1)
	        # Update hyper params
	        self.n_heads = self.n_heads - len(heads)
	        self.inner_dim = self.key_value_proj_dim * self.n_heads
	        self.pruned_heads = self.pruned_heads.union(heads)

	    @staticmethod
	    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
	        """
	        Adapted from Mesh Tensorflow:
	        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

	        Translate relative position to a bucket number for relative attention. The relative position is defined as
	        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
	        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
	        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
	        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
	        This should allow for more graceful generalization to longer sequences than the model has been trained on

	        Args:
	            relative_position: an integer Tensor
	            bidirectional: a boolean - whether the attention is bidirectional
	            num_buckets: an integer
	            max_distance: an integer

	        Returns:
	            a Tensor with the same shape as relative_position, containing integer values in the range
	            [0, num_buckets)
	        """
	        relative_buckets = 0
	        if bidirectional:
	            # Half of the buckets encode positive offsets, the other half non-positive ones.
	            num_buckets //= 2
	            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
	            relative_position = torch.abs(relative_position)
	        else:
	            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
	        # now relative_position is in the range [0, inf)

	        # half of the buckets are for exact increments in positions
	        max_exact = num_buckets // 2
	        is_small = relative_position < max_exact

	        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
	        relative_position_if_large = max_exact + (
	            torch.log(relative_position.float() / max_exact)
	            / math.log(max_distance / max_exact)
	            * (num_buckets - max_exact)
	        ).to(torch.long)
	        relative_position_if_large = torch.min(
	            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
	        )

	        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
	        return relative_buckets

	    def compute_bias(self, query_length, key_length, multivar_dim=-1, device=None):
	        """
	        Compute binned relative position bias.

	        Returns a tensor of shape `(1, num_heads, query_length, key_length)`, or
	        `(1, multivar_dim, num_heads, query_length, key_length)` when `multivar_dim != -1`
	        (the same bias is broadcast to every variable).
	        """
	        if device is None:
	            device = self.relative_attention_bias.weight.device
	        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
	        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
	        relative_position = memory_position - context_position  # shape (query_length, key_length)
	        relative_position_bucket = self._relative_position_bucket(
	            relative_position,  # shape (query_length, key_length)
	            bidirectional=(not self.is_decoder),
	            num_buckets=self.relative_attention_num_buckets,
	            max_distance=self.relative_attention_max_distance,
	        )
	        values = self.relative_attention_bias(relative_position_bucket)  # (query_length, key_length, num_heads)
	        values = values.permute([2, 0, 1]).unsqueeze(0)  # (1, num_heads, query_length, key_length)
	        if multivar_dim != -1:
	            # `expand` right-aligns the sizes: the existing leading 1 becomes `multivar_dim`
	            # and a new leading axis of size 1 is added in front of it.
	            values = values.expand(1, multivar_dim, -1, -1, -1)

	        return values

	    def forward(
	        self,
	        hidden_states,
	        mask=None,
	        key_value_states=None,
	        position_bias=None,
	        past_key_value=None,
	        layer_head_mask=None,
	        query_length=None,
	        use_cache=False,
	        output_attentions=False,
	    ):
	        """
	        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

	        Args:
	            hidden_states: `(batch_size, seq_length, dim)` or `(batch_size, multivar_dim, seq_length, dim)`.
	            mask: optional additive mask; it is added to the position bias, so it must broadcast against it.
	            key_value_states: optional source states for cross-attention (same rank as `hidden_states`).
	            position_bias: optional precomputed bias; computed (or zero-filled) here when None.
	            past_key_value: optional `(keys, values)` pair of cached per-head states.
	            layer_head_mask: optional multiplicative mask applied to the attention weights.
	            query_length: when given together with `past_key_value`, used instead of the cached key
	                length to extend the query length.
	            use_cache: return the key/value states (only when `self.is_decoder`).
	            output_attentions: also return the attention weights.

	        Returns:
	            Tuple `(attn_output, present_key_value_state, position_bias)` followed by `attn_weights`
	            when `output_attentions` is set.

	        Raises:
	            ValueError: if `past_key_value` does not contain exactly two tensors.
	        """
	        is_multivar = len(hidden_states.shape) != 3
	        if is_multivar:
	            batch_size, multivar_dim, seq_length = hidden_states.shape[:3]
	        else:
	            batch_size, seq_length = hidden_states.shape[:2]
	            multivar_dim = None

	        real_seq_length = seq_length

	        if past_key_value is not None:
	            if len(past_key_value) != 2:
	                raise ValueError(
	                    f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
	                )
	            # Cached keys are (..., n_heads, past_length, dim_per_head): read the time axis from the end
	            # so the 5-D multivariate cache is handled as well (axis 2 would be the head axis there).
	            real_seq_length += past_key_value[0].shape[-2] if query_length is None else query_length

	        # The time axis of the (un-projected) source states is always second to last.
	        key_length = real_seq_length if key_value_states is None else key_value_states.shape[-2]

	        def shape(states):
	            """Split the projected feature axis into heads and move the head axis in front of time."""
	            # states: torch.Size([3, 16, 512]) -> torch.Size([3, 8, 16, 64])
	            # states: torch.Size([3, 6, 16, 512]) -> torch.Size([3, 6, 8, 16, 64])
	            if len(states.shape) == 3:
	                return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
	            return states.view(batch_size, multivar_dim, -1, self.n_heads, self.key_value_proj_dim).transpose(2, 3)

	        def unshape(states):
	            """Inverse of `shape`: merge the heads back into a single feature axis."""
	            if len(states.shape) == 4:
	                return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
	            return states.transpose(2, 3).contiguous().view(batch_size, multivar_dim, -1, self.inner_dim)

	        def project(hidden_states, proj_layer, key_value_states, past_key_value):
	            """Project hidden states to key/value states, reusing or extending the cache when present."""
	            if key_value_states is None:
	                # self-attn: (..., n_heads, seq_length, dim_per_head)
	                hidden_states = shape(proj_layer(hidden_states))
	            elif past_key_value is None:
	                # cross-attn without cache: (..., n_heads, key_length, dim_per_head)
	                hidden_states = shape(proj_layer(key_value_states))

	            if past_key_value is not None:
	                if key_value_states is None:
	                    # self-attn: append the new step along the time axis (-2); axis 2 is only the
	                    # time axis for 4-D states, for the 5-D multivariate layout it is the head axis.
	                    hidden_states = torch.cat([past_key_value, hidden_states], dim=-2)
	                elif past_key_value.shape[-2] != key_value_states.shape[-2]:
	                    # checking that the `sequence_length` of the `past_key_value` is the same as
	                    # the provided `key_value_states` to support prefix tuning
	                    # cross-attn: recompute
	                    hidden_states = shape(proj_layer(key_value_states))
	                else:
	                    # cross-attn: reuse the cached projection
	                    hidden_states = past_key_value
	            return hidden_states

	        # get query states: (..., n_heads, seq_length, dim_per_head)
	        query_states = shape(self.q(hidden_states))

	        # get key/value states
	        past_keys, past_values = past_key_value if past_key_value is not None else (None, None)
	        key_states = project(hidden_states, self.k, key_value_states, past_keys)
	        value_states = project(hidden_states, self.v, key_value_states, past_values)

	        # compute scores; equivalent of torch.einsum("...qd,...kd->...qk", query_states, key_states)
	        scores = torch.matmul(query_states, key_states.transpose(-1, -2))

	        if position_bias is None:
	            if not self.has_relative_attention_bias:
	                if is_multivar:
	                    bias_shape = (1, multivar_dim, self.n_heads, real_seq_length, key_length)
	                else:
	                    bias_shape = (1, self.n_heads, real_seq_length, key_length)
	                position_bias = torch.zeros(bias_shape, device=scores.device, dtype=scores.dtype)
	                if self.gradient_checkpointing and self.training:
	                    position_bias.requires_grad = True
	            elif is_multivar:
	                position_bias = self.compute_bias(
	                    real_seq_length, key_length, multivar_dim=multivar_dim, device=scores.device
	                )
	            else:
	                position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)

	            # if key and values are already calculated we want only the bias rows of the new query
	            # positions; slice the query axis (-2) by the current step length.
	            if past_key_value is not None:
	                position_bias = position_bias[..., -seq_length:, :]

	            if mask is not None:
	                position_bias = position_bias + mask  # (batch_size, ..., n_heads, seq_length, key_length)

	        if self.pruned_heads:
	            # Drop the bias of pruned heads; the head axis is third from the end in both layouts.
	            keep_heads = torch.ones(position_bias.shape[-3])
	            keep_heads[list(self.pruned_heads)] = 0
	            position_bias_masked = position_bias[..., keep_heads.bool(), :, :]
	        else:
	            position_bias_masked = position_bias

	        scores += position_bias_masked
	        # softmax in fp32, then back to the score dtype
	        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
	        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

	        # Mask heads if we want to
	        if layer_head_mask is not None:
	            attn_weights = attn_weights * layer_head_mask

	        # (batch_size, seq_length, dim) or (batch_size, multivar_dim, seq_length, dim)
	        attn_output = unshape(torch.matmul(attn_weights, value_states))
	        attn_output = self.o(attn_output)

	        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
	        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

	        if output_attentions:
	            outputs = outputs + (attn_weights,)

	        return outputs


	class T5LayerSelfAttention(nn.Module):
	    """Pre-norm residual self-attention layer: `x + dropout(SelfAttention(layer_norm(x)))`."""

	    def __init__(self, config, has_relative_attention_bias=False):
	        """Build the attention module, its layer norm and the residual dropout from `config`."""
	        super().__init__()
	        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
	        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	    def forward(
	        self,
	        hidden_states,
	        attention_mask=None,
	        position_bias=None,
	        layer_head_mask=None,
	        past_key_value=None,
	        use_cache=False,
	        output_attentions=False,
	    ):
	        """
	        Run self-attention on the normalised input and add the result back to the input.

	        Returns:
	            Tuple whose first element is the updated hidden states, followed by every extra output
	            of the attention module (cache entry, position bias and, if requested, attention weights).
	        """
	        attn_results = self.SelfAttention(
	            self.layer_norm(hidden_states),
	            mask=attention_mask,
	            position_bias=position_bias,
	            layer_head_mask=layer_head_mask,
	            past_key_value=past_key_value,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	        )
	        attn_hidden, *extras = attn_results

	        # residual connection around the attention sub-layer
	        updated = hidden_states + self.dropout(attn_hidden)
	        return (updated, *extras)


	class T5LayerCrossAttention(nn.Module):
	    """Pre-norm residual cross-attention layer: `x + dropout(EncDecAttention(layer_norm(x), kv))`."""

	    def __init__(self, config):
	        """Build the attention module (without relative bias), its layer norm and the residual dropout."""
	        super().__init__()
	        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
	        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	    def forward(
	        self,
	        hidden_states,
	        key_value_states,
	        attention_mask=None,
	        position_bias=None,
	        layer_head_mask=None,
	        past_key_value=None,
	        use_cache=False,
	        query_length=None,
	        output_attentions=False,
	    ):
	        """
	        Attend from the normalised `hidden_states` to `key_value_states` and add the result back.

	        Returns:
	            Tuple whose first element is the updated hidden states, followed by every extra output
	            of the attention module (cache entry, position bias and, if requested, attention weights).
	        """
	        attn_results = self.EncDecAttention(
	            self.layer_norm(hidden_states),
	            mask=attention_mask,
	            key_value_states=key_value_states,
	            position_bias=position_bias,
	            layer_head_mask=layer_head_mask,
	            past_key_value=past_key_value,
	            use_cache=use_cache,
	            query_length=query_length,
	            output_attentions=output_attentions,
	        )
	        attn_hidden, *extras = attn_results

	        # residual connection around the attention sub-layer
	        layer_output = hidden_states + self.dropout(attn_hidden)
	        return (layer_output, *extras)


	class T5Block(nn.Module):
	    """
	    One transformer block: self-attention, optional cross-attention (decoder only) and feed-forward.

	    `self.layer` holds the sub-layers in that order, so the feed-forward layer is always `self.layer[-1]`
	    and the cross-attention layer, when present, is `self.layer[1]`.
	    """

	    def __init__(self, config, has_relative_attention_bias=False):
	        super().__init__()
	        self.is_decoder = config.is_decoder
	        self.layer = nn.ModuleList()
	        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
	        if self.is_decoder:
	            self.layer.append(T5LayerCrossAttention(config))

	        self.layer.append(T5LayerFF(config))

	    @staticmethod
	    def _clamp_fp16(hidden_states):
	        """Clamp fp16 activations to the finite fp16 range (to enable fp16 training); other dtypes pass through."""
	        if hidden_states.dtype != torch.float16:
	            return hidden_states
	        dtype_max = torch.finfo(hidden_states.dtype).max
	        # Leave a safety margin below the dtype maximum only when an inf is actually present.
	        clamp_value = torch.where(
	            torch.isinf(hidden_states).any(),
	            dtype_max - 1000,
	            dtype_max,
	        )
	        return torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

	    def forward(
	        self,
	        hidden_states,
	        attention_mask=None,
	        position_bias=None,
	        encoder_hidden_states=None,
	        encoder_attention_mask=None,
	        encoder_decoder_position_bias=None,
	        layer_head_mask=None,
	        cross_attn_layer_head_mask=None,
	        past_key_value=None,
	        use_cache=False,
	        output_attentions=False,
	        return_dict=True,
	    ):
	        """
	        Run the block's sub-layers in order.

	        Args:
	            past_key_value: optional tuple of 2 cached tensors (self-attention keys/values), or 4 when
	                `encoder_hidden_states` is given (the last two belong to cross-attention).
	            return_dict: accepted for interface compatibility; not used by this method.

	        Returns:
	            Tuple: hidden-states, present_key_value_states (only when `use_cache`), (self-attention
	            position bias), (self-attention weights), (cross-attention position bias),
	            (cross-attention weights).

	        Raises:
	            ValueError: if `past_key_value` does not have the expected number of entries.
	        """
	        if past_key_value is not None:
	            if not self.is_decoder:
	                logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.")
	            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

	            if len(past_key_value) != expected_num_past_key_values:
	                raise ValueError(
	                    f"There should be {expected_num_past_key_values} past states. "
	                    f"{'2 (key / value) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
	                    f"Got {len(past_key_value)} past key / value states"
	                )

	            self_attn_past_key_value = past_key_value[:2]
	            cross_attn_past_key_value = past_key_value[2:]
	        else:
	            self_attn_past_key_value, cross_attn_past_key_value = None, None

	        self_attention_outputs = self.layer[0](
	            hidden_states,
	            attention_mask=attention_mask,
	            position_bias=position_bias,
	            layer_head_mask=layer_head_mask,
	            past_key_value=self_attn_past_key_value,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	        )
	        hidden_states, present_key_value_state = self_attention_outputs[:2]
	        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

	        hidden_states = self._clamp_fp16(hidden_states)

	        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
	        if do_cross_attention:
	            # the actual query length is unknown for cross attention
	            # if using past key value states. Need to inject it here
	            if present_key_value_state is not None:
	                # Cached keys are (..., n_heads, length, dim_per_head). The length axis is read from the
	                # end because the multivariate cache is 5-D and its axis 2 is the head axis.
	                query_length = present_key_value_state[0].shape[-2]
	            else:
	                query_length = None

	            cross_attention_outputs = self.layer[1](
	                hidden_states,
	                key_value_states=encoder_hidden_states,
	                attention_mask=encoder_attention_mask,
	                position_bias=encoder_decoder_position_bias,
	                layer_head_mask=cross_attn_layer_head_mask,
	                past_key_value=cross_attn_past_key_value,
	                query_length=query_length,
	                use_cache=use_cache,
	                output_attentions=output_attentions,
	            )
	            hidden_states = self._clamp_fp16(cross_attention_outputs[0])

	            # Combine self attn and cross attn key value states
	            if present_key_value_state is not None:
	                present_key_value_state = present_key_value_state + cross_attention_outputs[1]

	            # Keep cross-attention outputs and relative position weights
	            attention_outputs = attention_outputs + cross_attention_outputs[2:]

	        # Apply Feed Forward layer
	        hidden_states = self._clamp_fp16(self.layer[-1](hidden_states))

	        outputs = (hidden_states,)

	        if use_cache:
	            outputs = outputs + (present_key_value_state,) + attention_outputs
	        else:
	            outputs = outputs + attention_outputs

	        return outputs


	class T5ClassificationHead(nn.Module):
	    """Head for sentence-level classification tasks."""

	    def __init__(self, config: T5MIMOConfig):
	        """Read `d_model`, `classifier_dropout` and `num_labels` from `config`."""
	        super().__init__()
	        self.dense = nn.Linear(config.d_model, config.d_model)
	        self.dropout = nn.Dropout(p=config.classifier_dropout)
	        self.out_proj = nn.Linear(config.d_model, config.num_labels)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Return logits: dropout -> dense -> tanh -> dropout -> output projection."""
	        pooled = torch.tanh(self.dense(self.dropout(hidden_states)))
	        return self.out_proj(self.dropout(pooled))


	class T5PreTrainedModel(PreTrainedModel):
	    """
	    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
	    models.
	    """

	    config_class = T5MIMOConfig
	    base_model_prefix = "transformer"
	    is_parallelizable = True
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["T5Block"]
	    _keep_in_fp32_modules = ["wo"]

	    @property
	    def dummy_inputs(self):
	        """Small fixed inputs built from `DUMMY_INPUTS` / `DUMMY_MASK`; encoder and decoder share the ids."""
	        token_ids = torch.tensor(DUMMY_INPUTS)
	        token_mask = torch.tensor(DUMMY_MASK)
	        return {
	            "decoder_input_ids": token_ids,
	            "input_ids": token_ids,
	            "decoder_attention_mask": token_mask,
	        }

	    def _init_weights(self, module):
	        """Initialize the weights"""
	        cfg = self.config
	        factor = cfg.initializer_factor  # Used for testing weights initialization

	        def init_normal(layer, std):
	            # zero-mean Gaussian initialisation of the layer's weight
	            layer.weight.data.normal_(mean=0.0, std=std)

	        def zero_bias_if_present(layer):
	            if hasattr(layer, "bias") and layer.bias is not None:
	                layer.bias.data.zero_()

	        if isinstance(module, T5LayerNorm):
	            module.weight.data.fill_(factor * 1.0)
	        elif isinstance(
	            module,
	            (T5MIMOModel, T5MIMOForConditionalGeneration, T5MIMOEncoderModel),
	        ):
	            # Mesh TensorFlow embeddings initialization
	            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
	            init_normal(module.shared, factor * 1.0)
	            if hasattr(module, "lm_head") and not cfg.tie_word_embeddings:
	                init_normal(module.lm_head, factor * 1.0)
	            if hasattr(module, "qa_outputs"):
	                init_normal(module.qa_outputs, factor * ((cfg.d_model) ** -0.5))
	                module.qa_outputs.bias.data.zero_()
	        elif isinstance(module, T5ClassificationHead):
	            for linear in (module.dense, module.out_proj):
	                init_normal(linear, factor * ((cfg.d_model) ** -0.5))
	                zero_bias_if_present(linear)
	        elif isinstance(module, T5DenseActDense):
	            # Mesh TensorFlow FF initialization
	            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
	            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
	            init_normal(module.wi, factor * ((cfg.d_model) ** -0.5))
	            zero_bias_if_present(module.wi)
	            init_normal(module.wo, factor * ((cfg.d_ff) ** -0.5))
	            zero_bias_if_present(module.wo)
	        elif isinstance(module, T5DenseGatedActDense):
	            for input_proj in (module.wi_0, module.wi_1):
	                init_normal(input_proj, factor * ((cfg.d_model) ** -0.5))
	                zero_bias_if_present(input_proj)
	            init_normal(module.wo, factor * ((cfg.d_ff) ** -0.5))
	            zero_bias_if_present(module.wo)
	        elif isinstance(module, T5Attention):
	            # Mesh TensorFlow attention initialization to avoid scaling before softmax
	            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
	            d_model = cfg.d_model
	            key_value_proj_dim = cfg.d_kv
	            n_heads = cfg.num_heads
	            init_normal(module.q, factor * ((d_model * key_value_proj_dim) ** -0.5))
	            init_normal(module.k, factor * (d_model**-0.5))
	            init_normal(module.v, factor * (d_model**-0.5))
	            init_normal(module.o, factor * ((n_heads * key_value_proj_dim) ** -0.5))
	            if module.has_relative_attention_bias:
	                init_normal(module.relative_attention_bias, factor * ((d_model) ** -0.5))

	    def _shift_right(self, input_ids):
	        """
	        Build decoder inputs from labels: shift `input_ids` one step right along the last axis, put
	        `decoder_start_token_id` in the first position and replace any -100 by `pad_token_id`.

	        Raises:
	            ValueError: if `decoder_start_token_id` or `pad_token_id` is not set on the config.
	        """
	        start_token = self.config.decoder_start_token_id
	        pad_token = self.config.pad_token_id

	        if start_token is None:
	            raise ValueError(
	                "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. "
	                "See T5 docs for more information."
	            )

	        if is_torch_fx_proxy(input_ids):
	            # Item assignment is not supported natively for proxies, so concatenate instead.
	            start_column = torch.full(input_ids.shape[:-1] + (1,), start_token)
	            shifted = torch.cat([start_column, input_ids[..., :-1]], dim=-1)
	        else:
	            shifted = input_ids.new_zeros(input_ids.shape)
	            shifted[..., 1:] = input_ids[..., :-1].clone()
	            shifted[..., 0] = start_token

	        if pad_token is None:
	            raise ValueError("self.model.config.pad_token_id has to be defined.")
	        # replace possible -100 values in labels by `pad_token_id`
	        shifted.masked_fill_(shifted == -100, pad_token)

	        return shifted


	class T5Stack(T5PreTrainedModel):
	def __init__(self, config, embed_tokens=None):
	    """
	    Build a stack of `config.num_layers` blocks followed by a final layer norm and dropout.

	    Args:
	        config: model configuration (reads `is_decoder`, `num_layers`, `d_model`,
	            `layer_norm_epsilon` and `dropout_rate`).
	        embed_tokens: optional token embedding module stored as `self.embed_tokens`.
	    """
	    super().__init__(config)

	    self.embed_tokens = embed_tokens
	    self.is_decoder = config.is_decoder

	    # Only the first block is created with a relative attention bias.
	    blocks = [
	        T5Block(config, has_relative_attention_bias=(layer_idx == 0))
	        for layer_idx in range(config.num_layers)
	    ]
	    self.block = nn.ModuleList(blocks)
	    self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	    self.dropout = nn.Dropout(config.dropout_rate)

	    # Initialize weights and apply final processing
	    self.post_init()
	    # Model parallel
	    self.model_parallel = False
	    self.device_map = None
	    self.gradient_checkpointing = False

	def parallelize(self, device_map=None):
	    """
	    Deprecated: spread the blocks over several devices according to `device_map`.

	    `device_map` maps a device key to the list of block indices placed on it; when None it is
	    built with `get_device_map` over all visible CUDA devices. Emits a `FutureWarning`.
	    """
	    warnings.warn(
	        "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
	        " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
	        " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
	        " 'block.1': 1, ...}",
	        FutureWarning,
	    )
	    # Check validity of device_map
	    if device_map is None:
	        device_map = get_device_map(len(self.block), range(torch.cuda.device_count()))
	    self.device_map = device_map
	    assert_device_map(self.device_map, len(self.block))
	    self.model_parallel = True

	    device_keys = self.device_map.keys()
	    if "cpu" in device_keys:
	        self.first_device = "cpu"
	    else:
	        self.first_device = "cuda:" + str(min(device_keys))
	    self.last_device = "cuda:" + str(max(device_keys))

	    # Load onto devices
	    for device_key, layer_indices in self.device_map.items():
	        for layer_idx in layer_indices:
	            target_device = "cuda:" + str(device_key)
	            self.block[layer_idx] = self.block[layer_idx].to(target_device)

	    # Set embed_tokens to first layer
	    self.embed_tokens = self.embed_tokens.to(self.first_device)
	    # Set final layer norm to last device
	    self.final_layer_norm = self.final_layer_norm.to(self.last_device)


	def deparallelize(self):
	    """Deprecated: undo `parallelize` by moving every sub-module back to the CPU. Emits a `FutureWarning`."""
	    warnings.warn(
	        "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	        FutureWarning,
	    )
	    self.model_parallel = False
	    self.device_map = None
	    self.first_device = self.last_device = "cpu"

	    num_blocks = len(self.block)
	    for block_idx in range(num_blocks):
	        self.block[block_idx] = self.block[block_idx].to("cpu")
	    self.embed_tokens = self.embed_tokens.to("cpu")
	    self.final_layer_norm = self.final_layer_norm.to("cpu")

	    # Release cached CUDA memory now that nothing lives on the GPU any more.
	    torch.cuda.empty_cache()

	def get_input_embeddings(self):
	    """Return the token embedding module held in `self.embed_tokens`."""
	    embeddings = self.embed_tokens
	    return embeddings

	def set_input_embeddings(self, new_embeddings):
	    """Replace the token embedding module held in `self.embed_tokens` with `new_embeddings`."""
	    setattr(self, "embed_tokens", new_embeddings)

	def forward(
		self,
		input_ids=None,
		attention_mask=None,
		encoder_hidden_states=None,
		encoder_attention_mask=None,
		inputs_embeds=None,
		head_mask=None,
		cross_attn_head_mask=None,
		past_key_values=None,
		use_cache=None,
		output_attentions=None,
		output_hidden_states=None,
		return_dict=None,
	):
		"""Run the stack of blocks over univariate or multivariate token sequences.

		Exactly one of `input_ids` / `inputs_embeds` must be given. The input (without the
		embedding dimension) is either 2D `(batch_size, seq_length)` or 3D
		`(batch_size, multivar_seqs, seq_length)`; the 3D case is referred to below as
		"multivariate" and gets a 5D attention mask.

		Args:
			input_ids: Token ids, 2D or 3D as described above.
			attention_mask: Mask matching the input shape. Defaults to all ones.
			encoder_hidden_states: Encoder output used for cross-attention (decoder only), 3D or 4D.
			encoder_attention_mask: Mask for `encoder_hidden_states`. Defaults to all ones of shape
				`(encoder_batch_size, encoder_sequence_length)`.
			inputs_embeds: Pre-computed embeddings used instead of `input_ids`.
			head_mask: Per-layer self-attention head mask.
			cross_attn_head_mask: Per-layer cross-attention head mask.
			past_key_values: Cached key/value states, one entry per block.
			use_cache: Whether to return key/value states. Only valid for a decoder.
			output_attentions: Whether to collect attention weights of every block.
			output_hidden_states: Whether to collect the hidden states of every block.
			return_dict: Return a `BaseModelOutputWithPastAndCrossAttentions` instead of a tuple.

		Returns:
			A `BaseModelOutputWithPastAndCrossAttentions`, or, when `return_dict` is false, a tuple
			of its non-`None` fields in the same order.

		Raises:
			ValueError: If both or neither of `input_ids` / `inputs_embeds` are given, if the stack
				has no token embeddings, or if `use_cache=True` is requested on a non-decoder.
		"""
		# Model parallel
		if self.model_parallel:
			torch.cuda.set_device(self.first_device)
			self.embed_tokens = self.embed_tokens.to(self.first_device)
		use_cache = use_cache if use_cache is not None else self.config.use_cache
		output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
		output_hidden_states = (
			output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
		)
		return_dict = return_dict if return_dict is not None else self.config.use_return_dict

		if input_ids is not None and inputs_embeds is not None:
			err_msg_prefix = "decoder_" if self.is_decoder else ""
			raise ValueError(
				f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
			)
		elif input_ids is not None:
			input_shape = input_ids.size()
		elif inputs_embeds is not None:
			# Drop the trailing embedding dimension.
			input_shape = inputs_embeds.size()[:-1]
		else:
			err_msg_prefix = "decoder_" if self.is_decoder else ""
			raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

		if inputs_embeds is None:
			if self.embed_tokens is None:
				raise ValueError("You have to initialize the model with valid token embeddings")
			inputs_embeds = self.embed_tokens(input_ids)

		# A 3D input shape carries an extra "multivariate sequences" axis between batch and time.
		multivariate = len(input_shape) == 3
		if multivariate:
			batch_size, multivar_seqs, seq_length = input_shape
		else:
			batch_size, seq_length = input_shape

		# required mask seq length can be calculated via length of past
		# NOTE(review): this reads the cached length from dim 2 of the first cached tensor; confirm
		# that this is still the sequence axis for caches produced from multivariate inputs.
		mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

		if use_cache is True:
			if not self.is_decoder:
				raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

		# initialize past_key_values with `None` if past does not exist
		if past_key_values is None:
			past_key_values = [None] * len(self.block)

		if attention_mask is None:
			if multivariate:
				mask_shape = (batch_size, multivar_seqs, mask_seq_length)
			else:
				mask_shape = (batch_size, mask_seq_length)
			attention_mask = torch.ones(mask_shape, device=inputs_embeds.device)

		# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
		# ourselves in which case we just need to make it broadcastable to all heads.
		# NOTE(review): the stock `get_extended_attention_mask` only builds a causal mask from 2D
		# masks; confirm that the decoder is causally masked for 3D (multivariate) masks - TODO confirm.
		extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
		if multivariate:
			# permute from [batch_size, 1, multivar_seqs, seq_length] to [batch_size, multivar_seqs, 1, seq_length]
			extended_attention_mask = extended_attention_mask.permute(0, 2, 1, 3)
			# Now make it [batch_size, multivar_seqs, 1, 1, seq_length]
			extended_attention_mask = extended_attention_mask.unsqueeze(3)

		# If a 2D or 3D attention mask is provided for the cross-attention
		# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
		if self.is_decoder and encoder_hidden_states is not None:
			if len(encoder_hidden_states.size()) == 3:
				encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
			else:
				# 4D encoder output: the second axis (multivariate sequences) is not needed here.
				encoder_batch_size, _, encoder_sequence_length, _ = encoder_hidden_states.size()

			if encoder_attention_mask is None:
				encoder_attention_mask = torch.ones(
					(encoder_batch_size, encoder_sequence_length), device=inputs_embeds.device, dtype=torch.long
				)
			encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
			if multivariate:
				# The permute below needs exactly five axes, i.e. a 4D result from
				# `invert_attention_mask`; it moves the original axis 2 next to the batch axis.
				encoder_extended_attention_mask = encoder_extended_attention_mask.unsqueeze(1)
				encoder_extended_attention_mask = encoder_extended_attention_mask.permute(0, 3, 1, 2, 4)
		else:
			encoder_extended_attention_mask = None

		if self.gradient_checkpointing and self.training:
			if use_cache:
				logger.warning_once(
					"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
				)
				use_cache = False

		# Prepare head mask if needed
		head_mask = self.get_head_mask(head_mask, self.config.num_layers)
		cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
		present_key_value_states = () if use_cache else None
		all_hidden_states = () if output_hidden_states else None
		all_attentions = () if output_attentions else None
		all_cross_attentions = () if (output_attentions and self.is_decoder) else None
		position_bias = None
		encoder_decoder_position_bias = None

		hidden_states = self.dropout(inputs_embeds)

		for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
			layer_head_mask = head_mask[i]
			cross_attn_layer_head_mask = cross_attn_head_mask[i]
			# Model parallel
			if self.model_parallel:
				torch.cuda.set_device(hidden_states.device)
				# Keep every tensor handed to the layer on the same device as hidden_states. The
				# layers receive `extended_attention_mask`, so that is the mask that must follow.
				if attention_mask is not None:
					attention_mask = attention_mask.to(hidden_states.device)
				extended_attention_mask = extended_attention_mask.to(hidden_states.device)
				if position_bias is not None:
					position_bias = position_bias.to(hidden_states.device)
				if encoder_hidden_states is not None:
					encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
				if encoder_extended_attention_mask is not None:
					encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
				if encoder_decoder_position_bias is not None:
					encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
				if layer_head_mask is not None:
					layer_head_mask = layer_head_mask.to(hidden_states.device)
				if cross_attn_layer_head_mask is not None:
					cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
			if output_hidden_states:
				all_hidden_states = all_hidden_states + (hidden_states,)

			if self.gradient_checkpointing and self.training:
				layer_outputs = self._gradient_checkpointing_func(
					layer_module.forward,
					hidden_states,
					extended_attention_mask,
					position_bias,
					encoder_hidden_states,
					encoder_extended_attention_mask,
					encoder_decoder_position_bias,
					layer_head_mask,
					cross_attn_layer_head_mask,
					None,  # past_key_value is always None with gradient checkpointing
					use_cache,
					output_attentions,
				)
			else:
				layer_outputs = layer_module(
					hidden_states,
					attention_mask=extended_attention_mask,
					position_bias=position_bias,
					encoder_hidden_states=encoder_hidden_states,
					encoder_attention_mask=encoder_extended_attention_mask,
					encoder_decoder_position_bias=encoder_decoder_position_bias,
					layer_head_mask=layer_head_mask,
					cross_attn_layer_head_mask=cross_attn_layer_head_mask,
					past_key_value=past_key_value,
					use_cache=use_cache,
					output_attentions=output_attentions,
				)

			# layer_outputs is a tuple with:
			# hidden-states, key-value-states, (self-attention position bias), (self-attention weights),
			# (cross-attention position bias), (cross-attention weights)
			if use_cache is False:
				# Insert a placeholder so the positional indices below stay the same without a cache.
				layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

			hidden_states, present_key_value_state = layer_outputs[:2]

			# We share the position biases between the layers - the first layer store them
			position_bias = layer_outputs[2]
			if self.is_decoder and encoder_hidden_states is not None:
				encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
			# append next layer key value states
			if use_cache:
				present_key_value_states = present_key_value_states + (present_key_value_state,)

			if output_attentions:
				all_attentions = all_attentions + (layer_outputs[3],)
				if self.is_decoder:
					all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

			# Model Parallel: If it's the last layer for that device, put things on the next device
			if self.model_parallel:
				for k, v in self.device_map.items():
					if i == v[-1] and "cuda:" + str(k) != self.last_device:
						hidden_states = hidden_states.to("cuda:" + str(k + 1))

		hidden_states = self.final_layer_norm(hidden_states)
		hidden_states = self.dropout(hidden_states)

		# Add last layer
		if output_hidden_states:
			all_hidden_states = all_hidden_states + (hidden_states,)

		if not return_dict:
			return tuple(
				v
				for v in [
					hidden_states,
					present_key_value_states,
					all_hidden_states,
					all_attentions,
					all_cross_attentions,
				]
				if v is not None
			)
		return BaseModelOutputWithPastAndCrossAttentions(
			last_hidden_state=hidden_states,
			past_key_values=present_key_value_states,
			hidden_states=all_hidden_states,
			attentions=all_attentions,
			cross_attentions=all_cross_attentions,
		)



	class T5MIMOModel(T5PreTrainedModel):
		"""Bare encoder-decoder model: two `T5Stack`s sharing one token embedding, no output head."""

		_keys_to_ignore_on_load_unexpected = [
			"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
		]
		_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

		def __init__(self, config: T5MIMOConfig):
			"""Build the shared embedding plus an encoder and a decoder stack from `config`."""
			super().__init__(config)
			self.shared = nn.Embedding(config.vocab_size, config.d_model)

			# Each stack gets its own copy of the config so the flags below do not leak across.
			encoder_config = copy.deepcopy(config)
			encoder_config.is_decoder = False
			encoder_config.use_cache = False
			encoder_config.is_encoder_decoder = False
			self.encoder = T5Stack(encoder_config, self.shared)

			decoder_config = copy.deepcopy(config)
			decoder_config.is_decoder = True
			decoder_config.is_encoder_decoder = False
			decoder_config.num_layers = config.num_decoder_layers
			self.decoder = T5Stack(decoder_config, self.shared)

			# Initialize weights and apply final processing
			self.post_init()

			# Model parallel
			self.model_parallel = False
			self.device_map = None

		def parallelize(self, device_map=None):
			"""Spread encoder and decoder blocks over CUDA devices (deprecated).

			Args:
				device_map: Mapping from device index to a list of block indices. When `None`, a map
					is derived from the number of encoder blocks and the visible CUDA devices.
			"""
			warnings.warn(
				"`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
				" with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
				" `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':"
				" 0, 'encoder.block.1': 1, ...}",
				FutureWarning,
			)
			self.device_map = (
				get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
				if device_map is None
				else device_map
			)
			assert_device_map(self.device_map, len(self.encoder.block))
			# The same map is applied to both stacks.
			self.encoder.parallelize(self.device_map)
			self.decoder.parallelize(self.device_map)
			self.model_parallel = True

		def deparallelize(self):
			"""Move both stacks back to the CPU and clear the model-parallel state (deprecated)."""
			warnings.warn(
				"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
				FutureWarning,
			)
			self.encoder.deparallelize()
			self.decoder.deparallelize()
			self.encoder = self.encoder.to("cpu")
			self.decoder = self.decoder.to("cpu")
			self.model_parallel = False
			self.device_map = None
			torch.cuda.empty_cache()

		def get_input_embeddings(self):
			"""Return the embedding module shared by encoder and decoder."""
			return self.shared

		def set_input_embeddings(self, new_embeddings):
			"""Install `new_embeddings` as the shared embedding and propagate it to both stacks."""
			self.shared = new_embeddings
			self.encoder.set_input_embeddings(new_embeddings)
			self.decoder.set_input_embeddings(new_embeddings)

		def _tie_weights(self):
			"""Tie (or clone) the stacks' embeddings to `self.shared` when the config asks for it."""
			if self.config.tie_word_embeddings:
				self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
				self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

		def get_encoder(self):
			"""Return the encoder stack."""
			return self.encoder

		def get_decoder(self):
			"""Return the decoder stack."""
			return self.decoder

		def _prune_heads(self, heads_to_prune):
			"""
			Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
			class PreTrainedModel
			"""
			for layer, heads in heads_to_prune.items():
				# The stack keeps its layers in `block`; the self-attention module is the first
				# sub-layer of each block (same access path as T5MIMOEncoderModel._prune_heads).
				self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

		def forward(
			self,
			input_ids: Optional[torch.LongTensor] = None,
			attention_mask: Optional[torch.FloatTensor] = None,
			decoder_input_ids: Optional[torch.LongTensor] = None,
			decoder_attention_mask: Optional[torch.BoolTensor] = None,
			head_mask: Optional[torch.FloatTensor] = None,
			decoder_head_mask: Optional[torch.FloatTensor] = None,
			cross_attn_head_mask: Optional[torch.Tensor] = None,
			encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
			past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
			inputs_embeds: Optional[torch.Tensor] = None,
			decoder_inputs_embeds: Optional[torch.Tensor] = None,
			use_cache: Optional[bool] = None,
			output_attentions: Optional[bool] = None,
			output_hidden_states: Optional[bool] = None,
			return_dict: Optional[bool] = None,
		) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
			r"""
			Encode the inputs (unless `encoder_outputs` is supplied) and run the decoder on top.

			Args:
				input_ids / inputs_embeds: Encoder inputs, forwarded to the encoder stack.
				attention_mask: Encoder mask; also used as the decoder's cross-attention mask.
				decoder_input_ids / decoder_inputs_embeds: Decoder inputs.
				decoder_attention_mask: Decoder self-attention mask.
				head_mask: Encoder head mask. Reused for the decoder when `decoder_head_mask` is
					`None` and encoder and decoder have the same number of layers.
				decoder_head_mask: Decoder self-attention head mask.
				cross_attn_head_mask: Decoder cross-attention head mask.
				encoder_outputs: Pre-computed encoder outputs; skips the encoder when given.
				past_key_values: Cached decoder key/value states.
				use_cache / output_attentions / output_hidden_states / return_dict: Output switches;
					`use_cache` and `return_dict` default to the config values.

			Returns:
				A `Seq2SeqModelOutput`, or, when `return_dict` is false, the decoder outputs tuple
				followed by the encoder outputs tuple.
			"""
			use_cache = use_cache if use_cache is not None else self.config.use_cache
			return_dict = return_dict if return_dict is not None else self.config.use_return_dict

			# head_mask was separated into two input args - head_mask, decoder_head_mask.
			# Fall back to the encoder mask only when the layer counts line up.
			if head_mask is not None and decoder_head_mask is None:
				if self.config.num_layers == self.config.num_decoder_layers:
					decoder_head_mask = head_mask

			# Encode if needed (training, first prediction pass)
			if encoder_outputs is None:
				encoder_outputs = self.encoder(
					input_ids=input_ids,
					attention_mask=attention_mask,
					inputs_embeds=inputs_embeds,
					head_mask=head_mask,
					output_attentions=output_attentions,
					output_hidden_states=output_hidden_states,
					return_dict=return_dict,
				)
			elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
				# Wrap a plain tuple so the attribute access at the end of this method works.
				encoder_outputs = BaseModelOutput(
					last_hidden_state=encoder_outputs[0],
					hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
					attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
				)

			hidden_states = encoder_outputs[0]

			# Set device for model parallelism
			if self.model_parallel:
				torch.cuda.set_device(self.decoder.first_device)
				hidden_states = hidden_states.to(self.decoder.first_device)
				if decoder_input_ids is not None:
					decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
				if attention_mask is not None:
					attention_mask = attention_mask.to(self.decoder.first_device)
				if decoder_attention_mask is not None:
					decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

			# Decode
			decoder_outputs = self.decoder(
				input_ids=decoder_input_ids,
				attention_mask=decoder_attention_mask,
				inputs_embeds=decoder_inputs_embeds,
				past_key_values=past_key_values,
				encoder_hidden_states=hidden_states,
				encoder_attention_mask=attention_mask,
				head_mask=decoder_head_mask,
				cross_attn_head_mask=cross_attn_head_mask,
				use_cache=use_cache,
				output_attentions=output_attentions,
				output_hidden_states=output_hidden_states,
				return_dict=return_dict,
			)

			if not return_dict:
				return decoder_outputs + encoder_outputs

			return Seq2SeqModelOutput(
				last_hidden_state=decoder_outputs.last_hidden_state,
				past_key_values=decoder_outputs.past_key_values,
				decoder_hidden_states=decoder_outputs.hidden_states,
				decoder_attentions=decoder_outputs.attentions,
				cross_attentions=decoder_outputs.cross_attentions,
				encoder_last_hidden_state=encoder_outputs.last_hidden_state,
				encoder_hidden_states=encoder_outputs.hidden_states,
				encoder_attentions=encoder_outputs.attentions,
			)



	class T5MIMOForConditionalGeneration(T5PreTrainedModel):
		"""Encoder-decoder model with a `MultivariateConvBlock` and a language-modeling head on top."""

		_keys_to_ignore_on_load_unexpected = [
			"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
		]
		_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

		def __init__(self, config: T5MIMOConfig):
			"""Build the shared embedding, both stacks, the conv block and the LM head from `config`."""
			super().__init__(config)
			self.model_dim = config.d_model

			self.shared = nn.Embedding(config.vocab_size, config.d_model)

			# Each stack gets its own copy of the config so the flags below do not leak across.
			encoder_config = copy.deepcopy(config)
			encoder_config.is_decoder = False
			encoder_config.use_cache = False
			encoder_config.is_encoder_decoder = False
			self.encoder = T5Stack(encoder_config, self.shared)

			decoder_config = copy.deepcopy(config)
			decoder_config.is_decoder = True
			decoder_config.is_encoder_decoder = False
			decoder_config.num_layers = config.num_decoder_layers
			self.decoder = T5Stack(decoder_config, self.shared)

			# Applied to the decoder output right before the vocabulary projection.
			self.conv_block = MultivariateConvBlock(config)
			self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

			# Initialize weights and apply final processing
			self.post_init()

			# Model parallel
			self.model_parallel = False
			self.device_map = None

		def parallelize(self, device_map=None):
			"""Spread encoder and decoder blocks over CUDA devices (deprecated).

			Args:
				device_map: Mapping from device index to a list of block indices. When `None`, a map
					is derived from the number of encoder blocks and the visible CUDA devices.
			"""
			warnings.warn(
				"`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
				" should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
				" provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
				" {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
				FutureWarning,
			)
			self.device_map = (
				get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
				if device_map is None
				else device_map
			)
			assert_device_map(self.device_map, len(self.encoder.block))
			self.encoder.parallelize(self.device_map)
			self.decoder.parallelize(self.device_map)
			# The conv block runs directly in front of the LM head, so both must share a device.
			self.conv_block = self.conv_block.to(self.decoder.first_device)
			self.lm_head = self.lm_head.to(self.decoder.first_device)
			self.model_parallel = True

		def deparallelize(self):
			"""Move every sub-module back to the CPU and clear the model-parallel state (deprecated)."""
			warnings.warn(
				"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
				FutureWarning,
			)
			self.encoder.deparallelize()
			self.decoder.deparallelize()
			self.encoder = self.encoder.to("cpu")
			self.decoder = self.decoder.to("cpu")
			self.conv_block = self.conv_block.to("cpu")
			self.lm_head = self.lm_head.to("cpu")
			self.model_parallel = False
			self.device_map = None
			torch.cuda.empty_cache()

		def get_input_embeddings(self):
			"""Return the embedding module shared by encoder and decoder."""
			return self.shared

		def set_input_embeddings(self, new_embeddings):
			"""Install `new_embeddings` as the shared embedding and propagate it to both stacks."""
			self.shared = new_embeddings
			self.encoder.set_input_embeddings(new_embeddings)
			self.decoder.set_input_embeddings(new_embeddings)

		def _tie_weights(self):
			"""Tie (or clone) the stacks' embeddings to `self.shared` when the config asks for it."""
			if self.config.tie_word_embeddings:
				self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
				self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

		def set_output_embeddings(self, new_embeddings):
			"""Replace the LM head."""
			self.lm_head = new_embeddings

		def get_output_embeddings(self):
			"""Return the LM head."""
			return self.lm_head

		def get_encoder(self):
			"""Return the encoder stack."""
			return self.encoder

		def get_decoder(self):
			"""Return the decoder stack."""
			return self.decoder

		def forward(
			self,
			input_ids: Optional[torch.LongTensor] = None,
			attention_mask: Optional[torch.FloatTensor] = None,
			decoder_input_ids: Optional[torch.LongTensor] = None,
			decoder_attention_mask: Optional[torch.BoolTensor] = None,
			head_mask: Optional[torch.FloatTensor] = None,
			decoder_head_mask: Optional[torch.FloatTensor] = None,
			cross_attn_head_mask: Optional[torch.Tensor] = None,
			encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
			past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
			inputs_embeds: Optional[torch.FloatTensor] = None,
			decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
			labels: Optional[torch.LongTensor] = None,
			use_cache: Optional[bool] = None,
			output_attentions: Optional[bool] = None,
			output_hidden_states: Optional[bool] = None,
			return_dict: Optional[bool] = None,
		) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
			r"""
			Encode, decode, apply the conv block and project onto the vocabulary.

			Args:
				input_ids / inputs_embeds: Encoder inputs, forwarded to the encoder stack.
				attention_mask: Encoder mask; also used as the decoder's cross-attention mask.
				decoder_input_ids / decoder_inputs_embeds: Decoder inputs. When both are `None` and
					`labels` is given, the decoder inputs are `labels` shifted right.
				decoder_attention_mask: Decoder self-attention mask.
				head_mask: Encoder head mask. Reused for the decoder when `decoder_head_mask` is
					`None` and encoder and decoder have the same number of layers.
				decoder_head_mask: Decoder self-attention head mask.
				cross_attn_head_mask: Decoder cross-attention head mask.
				encoder_outputs: Pre-computed encoder outputs; skips the encoder when given.
				past_key_values: Cached decoder key/value states.
				labels (`torch.LongTensor`, optional): Target token ids. They are flattened to one
					dimension and compared against the logits flattened to `(-1, vocab_size)`, so
					their shape must match the logits without the vocabulary axis (presumably
					`(batch_size, sequence_length)` or `(batch_size, multivar_seqs, sequence_length)`
					- TODO confirm). Entries equal to `-100` are ignored by the loss.
				use_cache / output_attentions / output_hidden_states / return_dict: Output switches;
					`use_cache` and `return_dict` default to the config values.

			Returns:
				A `Seq2SeqLMOutput`, or, when `return_dict` is false, a tuple
				`(loss?, lm_logits, *decoder_outputs[1:], *encoder_outputs)` where `loss` is only
				present if `labels` was given.
			"""
			use_cache = use_cache if use_cache is not None else self.config.use_cache
			return_dict = return_dict if return_dict is not None else self.config.use_return_dict

			# head_mask was separated into two input args - head_mask, decoder_head_mask.
			# Fall back to the encoder mask only when the layer counts line up.
			if head_mask is not None and decoder_head_mask is None:
				if self.config.num_layers == self.config.num_decoder_layers:
					decoder_head_mask = head_mask

			# Encode if needed (training, first prediction pass)
			if encoder_outputs is None:
				encoder_outputs = self.encoder(
					input_ids=input_ids,
					attention_mask=attention_mask,
					inputs_embeds=inputs_embeds,
					head_mask=head_mask,
					output_attentions=output_attentions,
					output_hidden_states=output_hidden_states,
					return_dict=return_dict,
				)
			elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
				# Wrap a plain tuple so the attribute access at the end of this method works.
				encoder_outputs = BaseModelOutput(
					last_hidden_state=encoder_outputs[0],
					hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
					attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
				)

			hidden_states = encoder_outputs[0]

			if self.model_parallel:
				torch.cuda.set_device(self.decoder.first_device)

			if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
				# get decoder inputs from shifting lm labels to the right
				decoder_input_ids = self._shift_right(labels)

			# Move the decoder inputs to the decoder's first device for model parallelism
			if self.model_parallel:
				hidden_states = hidden_states.to(self.decoder.first_device)
				if decoder_input_ids is not None:
					decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
				if attention_mask is not None:
					attention_mask = attention_mask.to(self.decoder.first_device)
				if decoder_attention_mask is not None:
					decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

			# Decode
			decoder_outputs = self.decoder(
				input_ids=decoder_input_ids,
				attention_mask=decoder_attention_mask,
				inputs_embeds=decoder_inputs_embeds,
				past_key_values=past_key_values,
				encoder_hidden_states=hidden_states,
				encoder_attention_mask=attention_mask,
				head_mask=decoder_head_mask,
				cross_attn_head_mask=cross_attn_head_mask,
				use_cache=use_cache,
				output_attentions=output_attentions,
				output_hidden_states=output_hidden_states,
				return_dict=return_dict,
			)

			sequence_output = decoder_outputs[0]

			# Set device for model parallelism: the conv block and the LM head both consume
			# `sequence_output`, so all three have to live on the same device.
			if self.model_parallel:
				torch.cuda.set_device(self.encoder.first_device)
				self.lm_head = self.lm_head.to(self.encoder.first_device)
				self.conv_block = self.conv_block.to(self.encoder.first_device)
				sequence_output = sequence_output.to(self.lm_head.weight.device)

			if self.config.tie_word_embeddings:
				# Rescale output before projecting on vocab
				# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
				sequence_output = sequence_output * (self.model_dim**-0.5)

			sequence_output = self.conv_block(sequence_output)
			lm_logits = self.lm_head(sequence_output)

			loss = None
			if labels is not None:
				loss_fct = CrossEntropyLoss(ignore_index=-100)
				# move labels to correct device to enable PP
				labels = labels.to(lm_logits.device)
				# `reshape` flattens labels of any rank and also copes with non-contiguous tensors.
				loss = loss_fct(lm_logits.reshape(-1, lm_logits.size(-1)), labels.reshape(-1))
				# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

			if not return_dict:
				output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
				return ((loss,) + output) if loss is not None else output

			return Seq2SeqLMOutput(
				loss=loss,
				logits=lm_logits,
				past_key_values=decoder_outputs.past_key_values,
				decoder_hidden_states=decoder_outputs.hidden_states,
				decoder_attentions=decoder_outputs.attentions,
				cross_attentions=decoder_outputs.cross_attentions,
				encoder_last_hidden_state=encoder_outputs.last_hidden_state,
				encoder_hidden_states=encoder_outputs.hidden_states,
				encoder_attentions=encoder_outputs.attentions,
			)

		def prepare_inputs_for_generation(
			self,
			input_ids,
			past_key_values=None,
			attention_mask=None,
			head_mask=None,
			decoder_head_mask=None,
			decoder_attention_mask=None,
			cross_attn_head_mask=None,
			use_cache=None,
			encoder_outputs=None,
			**kwargs,
		):
			"""Assemble the keyword arguments for one decoding step.

			When a cache is present, `input_ids` is trimmed along dim 1 so that only the tokens not
			yet covered by the cache are fed to the decoder. `input_ids` is returned under the key
			`decoder_input_ids`; the remaining arguments are passed through unchanged.
			"""
			# cut decoder_input_ids if past_key_values is used
			if past_key_values is not None:
				# NOTE(review): both the cache length (dim 2) and the slicing below (dim 1) assume
				# the univariate layout; confirm the axes for multivariate decoder inputs.
				past_length = past_key_values[0][0].shape[2]

				# Some generation methods already pass only the last input ID
				if input_ids.shape[1] > past_length:
					remove_prefix_length = past_length
				else:
					# Default to old behavior: keep only final ID
					remove_prefix_length = input_ids.shape[1] - 1

				input_ids = input_ids[:, remove_prefix_length:]

			return {
				"decoder_input_ids": input_ids,
				"past_key_values": past_key_values,
				"encoder_outputs": encoder_outputs,
				"attention_mask": attention_mask,
				"head_mask": head_mask,
				"decoder_head_mask": decoder_head_mask,
				"decoder_attention_mask": decoder_attention_mask,
				"cross_attn_head_mask": cross_attn_head_mask,
				"use_cache": use_cache,
			}

		def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
			"""Derive decoder input ids by shifting `labels` one position to the right."""
			return self._shift_right(labels)

		def _reorder_cache(self, past_key_values, beam_idx):
			"""Reorder every cached tensor along dim 0 according to `beam_idx`.

			Returns `past_key_values` unchanged (after logging a hint) when there is no cache.

			Raises:
				ValueError: If reordering changed the shape of the first state of a layer or the
					number of states in a layer.
			"""
			# if decoder past is not included in output
			# speedy decoding is disabled and no need to reorder
			if past_key_values is None:
				logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
				return past_key_values

			reordered_decoder_past = ()
			for layer_past_states in past_key_values:
				# Pick the entries selected by `beam_idx` along dim 0 of every state of this layer.
				reordered_layer_past_states = tuple(
					layer_past_state.index_select(0, beam_idx.to(layer_past_state.device))
					for layer_past_state in layer_past_states
				)

				if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
					raise ValueError(
						f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
					)
				if len(reordered_layer_past_states) != len(layer_past_states):
					raise ValueError(
						f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
					)

				reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
			return reordered_decoder_past



	class T5MIMOEncoderModel(T5PreTrainedModel):
		"""Encoder-only variant: a single `T5Stack` on top of a shared token embedding."""

		_tied_weights_keys = ["encoder.embed_tokens.weight"]
		_keys_to_ignore_on_load_unexpected = [r"decoder"]

		def __init__(self, config: T5MIMOConfig):
			"""Create the shared embedding and the encoder stack described by `config`."""
			super().__init__(config)
			self.shared = nn.Embedding(config.vocab_size, config.d_model)

			# The stack works on a private copy of the config with caching switched off.
			stack_config = copy.deepcopy(config)
			stack_config.use_cache = False
			stack_config.is_encoder_decoder = False
			self.encoder = T5Stack(stack_config, self.shared)

			# Initialize weights and apply final processing
			self.post_init()

			# Model parallel
			self.model_parallel = False
			self.device_map = None

		def parallelize(self, device_map=None):
			"""Spread the encoder blocks over CUDA devices (deprecated).

			A `device_map` of `None` is replaced by one derived from the number of encoder blocks
			and the visible CUDA devices.
			"""
			warnings.warn(
				"`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
				" your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
				" `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
				" 'block.1': 1, ...}",
				FutureWarning,
			)
			if device_map is None:
				device_map = get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
			self.device_map = device_map
			assert_device_map(self.device_map, len(self.encoder.block))
			self.encoder.parallelize(self.device_map)
			self.model_parallel = True

		def deparallelize(self):
			"""Bring the encoder back to the CPU and reset the model-parallel state (deprecated)."""
			warnings.warn(
				"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
				FutureWarning,
			)
			self.encoder.deparallelize()
			self.encoder = self.encoder.to("cpu")
			self.model_parallel = False
			self.device_map = None
			torch.cuda.empty_cache()

		def get_input_embeddings(self):
			"""Return the shared token embedding."""
			return self.shared

		def set_input_embeddings(self, new_embeddings):
			"""Swap in `new_embeddings` for both the model and its encoder stack."""
			self.shared = new_embeddings
			self.encoder.set_input_embeddings(new_embeddings)

		def _tie_weights(self):
			"""Tie (or clone) the encoder embedding to `self.shared` if the config requests tying."""
			if not self.config.tie_word_embeddings:
				return
			self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

		def get_encoder(self):
			"""Return the encoder stack."""
			return self.encoder

		def _prune_heads(self, heads_to_prune):
			"""
			Prune self-attention heads. `heads_to_prune` maps a layer index to the list of heads to
			remove in that layer (see the base class `PreTrainedModel`).
			"""
			for layer_index, head_ids in heads_to_prune.items():
				self_attention = self.encoder.block[layer_index].layer[0].SelfAttention
				self_attention.prune_heads(head_ids)

		def forward(
			self,
			input_ids: Optional[torch.LongTensor] = None,
			attention_mask: Optional[torch.FloatTensor] = None,
			head_mask: Optional[torch.FloatTensor] = None,
			inputs_embeds: Optional[torch.FloatTensor] = None,
			output_attentions: Optional[bool] = None,
			output_hidden_states: Optional[bool] = None,
			return_dict: Optional[bool] = None,
		) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
			r"""
			Run the encoder stack and return its outputs unchanged.

			All arguments are forwarded to the encoder; `return_dict` falls back to
			`config.use_return_dict` when it is `None`.
			"""
			if return_dict is None:
				return_dict = self.config.use_return_dict

			return self.encoder(
				input_ids=input_ids,
				attention_mask=attention_mask,
				inputs_embeds=inputs_embeds,
				head_mask=head_mask,
				output_attentions=output_attentions,
				output_hidden_states=output_hidden_states,
				return_dict=return_dict,
			)