GLM-t5-small / modeling_t5.py

Upload model

429485d verified 3 months ago

108 kB

	# coding=utf-8
	# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the language governing permissions and
	# limitations under the License.
	"""This code is adapted from the T5 code on the Huggingface Transformers library."""
	""" PyTorch GLM T5 model."""


	import copy
	import math
	import os
	import warnings
	from typing import Optional, Tuple, Union
	import logging

	import torch
	from torch import nn
	from torch.nn import CrossEntropyLoss
	from torch.utils.checkpoint import checkpoint

	from transformers.activations import ACT2FN
	from transformers.modeling_outputs import (
	BaseModelOutput,
	BaseModelOutputWithPastAndCrossAttentions,
	Seq2SeqLMOutput,
	Seq2SeqModelOutput,
	)
	from transformers.modeling_utils import PreTrainedModel
	from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
	from transformers.utils import (
	DUMMY_INPUTS,
	DUMMY_MASK,
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	is_torch_fx_proxy,
	replace_return_docstrings,
	)
	from transformers.utils import logging as transformers_logging
	from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
	from .configuration_t5 import T5Config
	from .wrapper_functions import DataProcessor


	# Module-level logger obtained through the transformers logging utilities.
	logger = transformers_logging.get_logger(__name__)

	# Names intended for generated documentation (presumably consumed by the docstring
	# decorators imported above; their use sites are outside this part of the file).
	_CONFIG_FOR_DOC = "T5Config"
	_CHECKPOINT_FOR_DOC = "t5-small"

	####################################################
	# Identifiers of the original pretrained T5 checkpoints.
	# This is a plain list of model ids (no URLs are stored here).
	####################################################
	T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
	    "t5-small",
	    "t5-base",
	    "t5-large",
	    "t5-3b",
	    "t5-11b",
	    # See all T5 models at https://huggingface.co/models?filter=t5
	]


	####################################################
	# This is a conversion method from TF 1.0 to PyTorch
	# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
	####################################################
	def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
	    """Copy the variables of a TensorFlow T5 checkpoint into a PyTorch model.

	    The conversion has not been validated for GLM, so the function refuses to run:
	    its first statement raises ``NotImplementedError``. The conversion logic that
	    follows is therefore unreachable; it is kept as a starting point for a port.

	    Args:
	        model: PyTorch model whose sub-modules are walked by TF scope name.
	        config: unused by the code below.
	        tf_checkpoint_path: path of the TensorFlow checkpoint.

	    Returns:
	        The model that was passed in (only once the guard above is removed).

	    Raises:
	        NotImplementedError: always.
	    """
	    raise NotImplementedError("NOT TESTED; might need adjustments for GLM")
	    try:
	        import re

	        import numpy as np
	        import tensorflow as tf
	    except ImportError:
	        logger.error(
	            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
	            "https://www.tensorflow.org/install/ for installation instructions."
	        )
	        raise
	    checkpoint_path = os.path.abspath(tf_checkpoint_path)
	    logger.info(f"Converting TensorFlow checkpoint from {checkpoint_path}")

	    # Read every variable stored in the checkpoint.
	    names = []
	    tf_weights = {}
	    for var_name, var_shape in tf.train.list_variables(checkpoint_path):
	        logger.info(f"Loading TF weight {var_name} with shape {var_shape}")
	        var_value = tf.train.load_variable(checkpoint_path, var_name)
	        names.append(var_name)
	        tf_weights[var_name] = var_value

	    # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
	    # which are not required for using a pretrained model.
	    optimizer_scopes = ("adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step")
	    direct_weight_scopes = ("kernel", "scale", "embedding")

	    for txt_name in names:
	        name = txt_name.split("/")
	        is_optimizer_state = any(part in optimizer_scopes for part in name)
	        if is_optimizer_state or "_slot_" in name[-1]:
	            logger.info(f"Skipping {'/'.join(name)}")
	            tf_weights.pop(txt_name, None)
	            continue

	        pointer = model
	        array = tf_weights[txt_name]

	        # Walk the PyTorch module tree following the TF scope path.
	        for m_name in name:
	            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
	                scope_names = re.split(r"_(\d+)", m_name)
	            else:
	                scope_names = [m_name]
	            scope = scope_names[0]

	            if scope in direct_weight_scopes:
	                pointer = pointer.weight
	            elif scope == "self_attention":
	                pointer = pointer.layer[0]
	            elif scope == "enc_dec_attention":
	                pointer = pointer.layer[1]
	            elif scope == "dense_relu_dense":
	                pointer = pointer.layer[2]
	            elif scope == "rms_norm":
	                if hasattr(pointer, "layer_norm"):
	                    pointer = pointer.layer_norm
	                elif hasattr(pointer, "final_layer_norm"):
	                    pointer = pointer.final_layer_norm
	            elif scope == "scale":
	                # Never taken: "scale" is already matched by the first branch.
	                pointer = pointer.weight
	            elif scope in ("output_bias", "beta"):
	                pointer = pointer.bias
	            elif scope == "squad":
	                pointer = pointer.classifier
	            elif scope == "decoder" and name[1] == "logits":
	                continue
	            elif scope == "logits":
	                pointer = pointer.lm_head
	            elif scope == "wi" and len(scope_names) > 1 and scope_names[1].isdigit():
	                pointer = getattr(pointer, f"wi_{scope_names[1]}")
	                continue
	            else:
	                try:
	                    pointer = getattr(pointer, scope)
	                except AttributeError:
	                    logger.info(f"Skipping {'/'.join(name)}")
	                    continue
	            if len(scope_names) >= 2:
	                pointer = pointer[int(scope_names[1])]

	        # `scope` still holds the first component of the last path element here.
	        if scope not in direct_weight_scopes:
	            pointer = pointer.weight
	        if scope != "embedding":
	            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
	            array = np.transpose(array)
	        try:
	            assert (
	                pointer.shape == array.shape
	            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
	        except AssertionError as e:
	            e.args += (pointer.shape, array.shape)
	            raise
	        logger.info(f"Initialize PyTorch weight {name}")
	        pointer.data = torch.from_numpy(array.astype(np.float32))
	        tf_weights.pop(txt_name, None)

	    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
	    return model


	####################################################
	# PyTorch Models are constructed by sub-classing
	# - torch.nn.Module for the layers and
	# - PreTrainedModel for the models (itself a sub-class of nn.Module)
	####################################################
	# Documentation text describing how a device map distributes the model across devices.
	# Presumably attached to `parallelize` methods defined later in the file (use sites are
	# not visible from here).
	PARALLELIZE_DOCSTRING = r"""
	This is an experimental feature and is a subject to change at a moment's notice.

	Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
	it will evenly distribute blocks across all devices.

	Args:
	device_map (`Dict[int, list]`, optional, defaults to None):
	A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
	automatically mapped to the first device (for esoteric reasons). That means that the first device should
	have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
	following number of attention modules:

	- t5-small: 6
	- t5-base: 12
	- t5-large: 24
	- t5-3b: 24
	- t5-11b: 24

	Example:

	```python
	# Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
	model = T5ForConditionalGeneration.from_pretrained("t5-3b")
	device_map = {
	0: [0, 1, 2],
	1: [3, 4, 5, 6, 7, 8, 9],
	2: [10, 11, 12, 13, 14, 15, 16],
	3: [17, 18, 19, 20, 21, 22, 23],
	}
	model.parallelize(device_map)
	```
	"""
	# Documentation text for moving a model-parallel model back to the CPU.
	# Presumably attached to `deparallelize` methods defined later in the file.
	DEPARALLELIZE_DOCSTRING = r"""
	Moves the model to cpu from a model parallel state.

	Example:

	```python
	# On a 4 GPU machine with t5-3b:
	model = T5ForConditionalGeneration.from_pretrained("t5-3b")
	device_map = {
	0: [0, 1, 2],
	1: [3, 4, 5, 6, 7, 8, 9],
	2: [10, 11, 12, 13, 14, 15, 16],
	3: [17, 18, 19, 20, 21, 22, 23],
	}
	model.parallelize(device_map) # Splits the model across several devices
	model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
	```
	"""


	class T5LayerNorm(nn.Module):
	    """Layer normalisation in the T5 style: a learned scale only, no mean subtraction and no bias."""

	    def __init__(self, hidden_size, eps=1e-6):
	        """
	        Args:
	            hidden_size: size of the last dimension; the learned scale has this many entries.
	            eps: constant added to the mean square before taking the reciprocal square root.
	        """
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(hidden_size))
	        self.variance_epsilon = eps

	    def forward(self, hidden_states):
	        """Scale `hidden_states` by the reciprocal root mean square of its last dimension."""
	        # Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467): the
	        # "variance" is computed without subtracting the mean. The accumulation is done
	        # in fp32 so that half-precision inputs do not lose precision.
	        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
	        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
	        normed = hidden_states * inverse_rms

	        # Go back to half precision when the scale itself is stored in half precision.
	        if self.weight.dtype in (torch.float16, torch.bfloat16):
	            normed = normed.to(self.weight.dtype)

	        return self.weight * normed



	# NOTE: the optional replacement of T5LayerNorm by apex's FusedRMSNorm is commented
	# out below, so the pure-PyTorch T5LayerNorm defined above is always the one in use.
	# try:
	# from apex.normalization import FusedRMSNorm

	# T5LayerNorm = FusedRMSNorm # noqa

	# logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm")
	# except ImportError:
	# # using the normal T5LayerNorm
	# pass
	# except Exception:
	# logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
	# pass

	# Add T5LayerNorm to the list of layer-norm classes exported by transformers.pytorch_utils
	# (what consumes that list is outside this file).
	ALL_LAYERNORM_LAYERS.append(T5LayerNorm)


	class T5DenseActDense(nn.Module):
	    """Two-layer feed-forward network: linear -> activation -> dropout -> linear, all without bias."""

	    def __init__(self, config: T5Config):
	        """Build the projections from `config.d_model`, `config.d_ff`, `config.dropout_rate` and `config.dense_act_fn`."""
	        super().__init__()
	        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
	        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
	        self.dropout = nn.Dropout(config.dropout_rate)
	        self.act = ACT2FN[config.dense_act_fn]

	    def forward(self, hidden_states):
	        """Apply the feed-forward network to `hidden_states` and return the result."""
	        activated = self.dropout(self.act(self.wi(hidden_states)))

	        # Cast the activations to the dtype of the output projection when the two differ,
	        # except when that projection is stored as int8.
	        out_weight = self.wo.weight
	        needs_cast = (
	            isinstance(out_weight, torch.Tensor)
	            and activated.dtype != out_weight.dtype
	            and out_weight.dtype != torch.int8
	        )
	        if needs_cast:
	            activated = activated.to(out_weight.dtype)

	        return self.wo(activated)


	class T5DenseGatedActDense(nn.Module):
	    """Gated feed-forward network: act(wi_0(x)) * wi_1(x) -> dropout -> wo, all without bias."""

	    def __init__(self, config: T5Config):
	        """Build the projections from `config.d_model`, `config.d_ff`, `config.dropout_rate` and `config.dense_act_fn`."""
	        super().__init__()
	        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
	        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
	        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
	        self.dropout = nn.Dropout(config.dropout_rate)
	        self.act = ACT2FN[config.dense_act_fn]

	    def forward(self, hidden_states):
	        """Apply the gated feed-forward network to `hidden_states` and return the result."""
	        gate = self.act(self.wi_0(hidden_states))
	        linear_branch = self.wi_1(hidden_states)
	        gated = self.dropout(gate * linear_branch)

	        # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
	        # See https://github.com/huggingface/transformers/issues/20287
	        # The int8 check covers users who force `_keep_in_fp32_modules` to be `None`.
	        out_weight = self.wo.weight
	        needs_cast = (
	            isinstance(out_weight, torch.Tensor)
	            and gated.dtype != out_weight.dtype
	            and out_weight.dtype != torch.int8
	        )
	        if needs_cast:
	            gated = gated.to(out_weight.dtype)

	        return self.wo(gated)


	class T5LayerFF(nn.Module):
	    """Pre-norm feed-forward sub-layer: layer norm -> feed-forward -> dropout -> residual add."""

	    def __init__(self, config: T5Config):
	        """Pick the gated or plain feed-forward network depending on `config.is_gated_act`."""
	        super().__init__()
	        feed_forward_cls = T5DenseGatedActDense if config.is_gated_act else T5DenseActDense
	        self.DenseReluDense = feed_forward_cls(config)

	        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	    def forward(self, hidden_states):
	        """Return `hidden_states` plus the dropped-out feed-forward output of its normalised version."""
	        ff_output = self.DenseReluDense(self.layer_norm(hidden_states))
	        return hidden_states + self.dropout(ff_output)


	class T5Attention(nn.Module):
	    """T5 multi-head attention with the GLM extensions.

	    On top of the standard T5 attention this module accepts
	    - `relative_position`: explicit per-sample relative positions,
	    - `use_additional_bucket`: a per-entry switch selecting extra relative-position buckets
	      that are appended after the standard ones (encoder only),
	    - `sparsity_mask`: a boolean mask of the query/key pairs that may attend to each other.
	    """

	    def __init__(self, config: "T5Config", has_relative_attention_bias=False):
	        """
	        Args:
	            config: model configuration; the fields read are visible in the body below.
	            has_relative_attention_bias: whether this module owns the relative position embedding.
	        """
	        super().__init__()
	        self.is_decoder = config.is_decoder
	        self.has_relative_attention_bias = has_relative_attention_bias

	        self.relative_attention_num_buckets = config.relative_attention_num_buckets
	        # Configurations without this field get no additional buckets.
	        self.relative_attention_num_additional_buckets = getattr(
	            config, "relative_attention_num_additional_buckets", 0
	        )
	        self.relative_attention_max_distance = config.relative_attention_max_distance
	        self.d_model = config.d_model
	        self.key_value_proj_dim = config.d_kv
	        self.n_heads = config.num_heads
	        self.dropout = config.dropout_rate
	        self.inner_dim = self.n_heads * self.key_value_proj_dim

	        # Mesh TensorFlow initialization to avoid scaling before softmax
	        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
	        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
	        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
	        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

	        if self.has_relative_attention_bias:
	            # Additional buckets are only allocated for the encoder.
	            if self.is_decoder:
	                num_buckets = self.relative_attention_num_buckets
	            else:
	                num_buckets = self.relative_attention_num_buckets + self.relative_attention_num_additional_buckets
	            self.relative_attention_bias = nn.Embedding(num_buckets, self.n_heads)
	        self.pruned_heads = set()
	        self.gradient_checkpointing = False

	    def prune_heads(self, heads):
	        """Remove the given attention heads from the q/k/v/o projections and update the bookkeeping."""
	        if len(heads) == 0:
	            return
	        heads, index = find_pruneable_heads_and_indices(
	            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
	        )
	        # Prune linear layers
	        self.q = prune_linear_layer(self.q, index)
	        self.k = prune_linear_layer(self.k, index)
	        self.v = prune_linear_layer(self.v, index)
	        self.o = prune_linear_layer(self.o, index, dim=1)
	        # Update hyper params
	        self.n_heads = self.n_heads - len(heads)
	        self.inner_dim = self.key_value_proj_dim * self.n_heads
	        self.pruned_heads = self.pruned_heads.union(heads)

	    @staticmethod
	    def _og_relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
	        """
	        This is the original implementation from T5, called by `_relative_position_bucket` below.

	        Adapted from Mesh Tensorflow:
	        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

	        Translate relative position to a bucket number for relative attention. The relative position is defined as
	        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
	        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
	        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
	        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
	        This should allow for more graceful generalization to longer sequences than the model has been trained on

	        Args:
	            relative_position: an integer Tensor
	            bidirectional: a boolean - whether the attention is bidirectional
	            num_buckets: an integer
	            max_distance: an integer

	        Returns:
	            a Tensor with the same shape as relative_position, containing integer values in the range [0, num_buckets)
	        """
	        relative_buckets = 0

	        if bidirectional:
	            num_buckets //= 2
	            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
	            relative_position = torch.abs(relative_position)
	        else:
	            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
	        # now relative_position is in the range [0, inf)

	        # half of the buckets are for exact increments in positions
	        max_exact = num_buckets // 2
	        is_small = relative_position < max_exact

	        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
	        relative_position_if_large = max_exact + (
	            torch.log(relative_position.float() / max_exact)
	            / math.log(max_distance / max_exact)
	            * (num_buckets - max_exact)
	        ).to(torch.long)
	        relative_position_if_large = torch.min(
	            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
	        )

	        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
	        return relative_buckets

	    @staticmethod
	    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128, use_additional_bucket=None):
	        """
	        Wrapper around `_og_relative_position_bucket` that supports additional buckets.

	        `use_additional_bucket` is a bool tensor of the same shape as `relative_position`. Entries where it is
	        False use the conventional T5 bucket of their relative position. Entries where it is True use an
	        additional bucket instead: the additional buckets are appended after the standard ones, so an entry with
	        relative position `p` is mapped to bucket `num_buckets + p` (e.g. `p = 0` selects the first additional
	        bucket, `p = 5` selects bucket `num_buckets + 5`). Entries of `relative_position` where
	        `use_additional_bucket` is True should therefore be non-negative integers; all other entries can be any
	        integer.

	        The additional buckets can be used to encode relative positions such as Graph to Graph for long-ranged
	        connections (e.g. in the global GLM), Graph to Text, or Text to Graph.

	        If `use_additional_bucket` is None (or a tensor where all entries are False), this function is equivalent
	        to `_og_relative_position_bucket`.

	        Note that the embeddings of additional buckets are not initialized automatically, so they need to be
	        initialized manually. This can be done by calling `init_relative_position_bias`.

	        Args:
	            relative_position: an integer Tensor
	            bidirectional: a boolean - whether the attention is bidirectional
	            num_buckets: number of standard buckets
	            max_distance: an integer
	            use_additional_bucket: None, or a Tensor with the same shape as relative_position and type torch.bool

	        Returns:
	            a Tensor with the same shape as relative_position, containing integer values in the range
	            [0, num_buckets + max(relative_position[use_additional_bucket]) + 1)
	        """
	        relative_buckets = T5Attention._og_relative_position_bucket(relative_position, bidirectional, num_buckets, max_distance)

	        if use_additional_bucket is None:
	            return relative_buckets

	        # relative_buckets is a freshly created tensor, so the in-place write does not touch the input.
	        relative_buckets[use_additional_bucket] = relative_position[use_additional_bucket] + num_buckets

	        return relative_buckets

	    def compute_bias(self, query_length, key_length, device=None, relative_position=None, use_additional_bucket=None):
	        """Compute binned relative position bias.

	        Args:
	            query_length: number of query positions (only used when `relative_position` is None).
	            key_length: number of key positions (only used when `relative_position` is None).
	            device: device of the result; defaults to the device of the bias embedding.
	            relative_position: [MP] optional torch.long tensor of shape [query_length, key_length]. When None,
	                `memory_position - context_position` of a plain sequence is used.
	            use_additional_bucket: optional bool tensor forwarded to `_relative_position_bucket`.

	        Returns:
	            Tensor of shape (1, num_heads, query_length, key_length).
	        """
	        if device is None:
	            device = self.relative_attention_bias.weight.device

	        if relative_position is None:
	            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
	            memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
	            relative_position = memory_position - context_position  # shape (query_length, key_length)
	        else:
	            assert relative_position.dtype == torch.long, f"{relative_position.dtype} should be torch.long"
	            assert relative_position.device == device, f"{relative_position.device} should be {device}"

	        relative_position_bucket = self._relative_position_bucket(
	            relative_position,  # shape (query_length, key_length)
	            bidirectional=(not self.is_decoder),
	            num_buckets=self.relative_attention_num_buckets,
	            max_distance=self.relative_attention_max_distance,
	            use_additional_bucket=use_additional_bucket,
	        )

	        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)

	        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)

	        return values

	    def forward(
	        self,
	        hidden_states,
	        mask=None,
	        key_value_states=None,
	        position_bias=None,
	        past_key_value=None,
	        layer_head_mask=None,
	        query_length=None,
	        use_cache=False,
	        output_attentions=False,
	        relative_position=None,
	        sparsity_mask=None,
	        use_additional_bucket=None,
	    ):
	        """
	        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

	        :param relative_position: [MP] relative position for the attention. If `None`, it will be computed as in a standard sequence-to-sequence model. If not `None`, it will be used as the relative position for the attention. It is a tensor of shape [batch_size, query_length, key_length].
	        :param sparsity_mask: [MP] sparsity mask for the attention. If `None`, no position is masked by it. If not `None`, it is a torch.bool tensor of shape [batch_size, query_length, key_length]. A value of True means that the corresponding attention weight is not masked, and a value of False means that the corresponding attention weight is masked. Queries for which every key is masked receive all-zero attention weights.
	        :param use_additional_bucket: [MP] whether to use additional buckets for the attention. If `None`, only standard positional encodings will be used. If not `None`, it is a bool tensor of shape [batch_size, query_length, key_length] and requires `relative_position`. A value of False means that the corresponding position is a standard relative position, and a value of True means that the corresponding additional bucket should be used.
	        :return: tuple `(attn_output, present_key_value_state, position_bias)`, followed by the attention weights if `output_attentions` is set. `present_key_value_state` is None unless this is a decoder and `use_cache` is set.
	        """
	        # Input is (batch_size, seq_length, dim)
	        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
	        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
	        batch_size, seq_length = hidden_states.shape[:2]

	        real_seq_length = seq_length

	        if past_key_value is not None:
	            assert (
	                len(past_key_value) == 2
	            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
	            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

	        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

	        def shape(states):
	            """projection"""
	            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

	        def unshape(states):
	            """reshape"""
	            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)

	        def project(hidden_states, proj_layer, key_value_states, past_key_value):
	            """projects hidden states correctly to key/query states"""
	            if key_value_states is None:
	                # self-attn
	                # (batch_size, n_heads, seq_length, dim_per_head)
	                hidden_states = shape(proj_layer(hidden_states))
	            elif past_key_value is None:
	                # cross-attn
	                # (batch_size, n_heads, seq_length, dim_per_head)
	                hidden_states = shape(proj_layer(key_value_states))

	            if past_key_value is not None:
	                if key_value_states is None:
	                    # self-attn
	                    # (batch_size, n_heads, key_length, dim_per_head)
	                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
	                elif past_key_value.shape[2] != key_value_states.shape[1]:
	                    # checking that the `sequence_length` of the `past_key_value` is the same as
	                    # the provided `key_value_states` to support prefix tuning
	                    # cross-attn
	                    # (batch_size, n_heads, seq_length, dim_per_head)
	                    hidden_states = shape(proj_layer(key_value_states))
	                else:
	                    # cross-attn
	                    hidden_states = past_key_value
	            return hidden_states

	        # get query states
	        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)

	        # get key/value states
	        key_states = project(
	            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
	        )
	        value_states = project(
	            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
	        )

	        # compute scores
	        scores = torch.matmul(
	            query_states, key_states.transpose(3, 2)
	        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9

	        if position_bias is None:
	            if not self.has_relative_attention_bias:
	                position_bias = torch.zeros(
	                    (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
	                )
	                if self.gradient_checkpointing and self.training:
	                    position_bias.requires_grad = True
	            elif relative_position is None:
	                assert use_additional_bucket is None
	                position_bias = self.compute_bias(
	                    real_seq_length, key_length, device=scores.device, use_additional_bucket=None
	                )
	            else:
	                # One bias per sample. `use_additional_bucket=None` is documented as "standard
	                # buckets only", so pair every sample with None instead of zipping over None.
	                if use_additional_bucket is None:
	                    bucket_flags = [None] * len(relative_position)
	                else:
	                    bucket_flags = use_additional_bucket
	                position_bias = torch.cat(
	                    tuple(
	                        self.compute_bias(
	                            real_seq_length,
	                            key_length,
	                            device=scores.device,
	                            relative_position=sample_position,
	                            use_additional_bucket=sample_flags,
	                        )
	                        for sample_position, sample_flags in zip(relative_position, bucket_flags)
	                    ),
	                    dim=0,
	                )

	            # if key and values are already calculated
	            # we want only the last query position bias
	            if past_key_value is not None:
	                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]

	            # Lazy %-formatting: the message is only built when debug logging is enabled.
	            logging.debug("position_bias = %s", position_bias.shape)
	            logging.debug("mask = %s", mask.shape if mask is not None else mask)
	            if mask is not None:
	                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)

	        if self.pruned_heads:
	            mask = torch.ones(position_bias.shape[1])
	            mask[list(self.pruned_heads)] = 0
	            position_bias_masked = position_bias[:, mask.bool()]
	        else:
	            position_bias_masked = position_bias

	        blocked_positions = None
	        if sparsity_mask is not None:
	            assert sparsity_mask.dtype == torch.bool, f"{sparsity_mask.dtype} should be torch.bool"
	            # Add a head dimension and negate so that True marks the positions to mask out.
	            blocked_positions = ~sparsity_mask.unsqueeze(1)

	        scores += position_bias_masked

	        # >>> masked softmax >>>
	        if blocked_positions is not None:
	            # Overwrite (rather than add to) the masked scores: the overwritten entries get no gradient,
	            # so NaN values produced by the softmax for fully masked rows do not reach the backward pass.
	            # masked_fill_ broadcasts the mask over the head dimension of `scores`.
	            scores.masked_fill_(blocked_positions, float("-inf"))

	        # (batch_size, n_heads, seq_length, key_length)
	        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
	            scores
	        )  # (batch_size, n_heads, seq_length, key_length)

	        # replace nan values in the attention weights with 0. nan happens if all positions are masked for one
	        # token, as then all inputs to softmax are -inf for that token
	        attn_weights = torch.nan_to_num(attn_weights, nan=0.0)
	        # <<< masked softmax <<<

	        attn_weights = nn.functional.dropout(
	            attn_weights, p=self.dropout, training=self.training
	        )  # (batch_size, n_heads, seq_length, key_length)

	        # Mask heads if we want to
	        if layer_head_mask is not None:
	            attn_weights = attn_weights * layer_head_mask

	        attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
	        attn_output = self.o(attn_output)

	        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
	        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

	        if output_attentions:
	            outputs = outputs + (attn_weights,)
	        return outputs

	    def init_relative_position_bias(
	        self,
	        modelsize: str,
	        is_decoder: bool = False,
	        init_additional_buckets_from: Optional[Union[int, list]] = None,
	    ):
	        """
	        Initializes parameters for relative position bias. This is necessary, if additional buckets are used, as then the weights are not initialized automatically when calling `from_pretrained`.
	        :param modelsize: the model size of the model from which the relative position bias should be inherited
	        :param is_decoder: whether the SelfAttention is in the decoder or not. This determines whether the relative position bias is initialized from the parents encoder or decoder.
	        :param init_additional_buckets_from: the relative positions whose buckets are used to initialize the additional buckets. If this is an int, then all additional buckets are initialized from the same bucket. If this is a list, then the list should have the same length as the number of additional buckets, and the i-th entry of the list determines from which bucket the i-th additional bucket is initialized. Setting this to None (or an element in the list to None) means that the additional bucket is not initialized, i.e. it is left unchanged.
	        :raises NotImplementedError: if this attention module belongs to a decoder.
	        """
	        if self.is_decoder:
	            raise NotImplementedError("Decoder is not tested.")

	        logging.debug('Loading model from which relative position bias should be inherited')
	        parent_model = T5EncoderModel.from_pretrained(modelsize)  # Needs to be changed for decoder support

	        logging.debug('Get relative position bias from parent model')
	        if is_decoder:
	            parent_bias = parent_model.decoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight
	        else:
	            parent_bias = parent_model.encoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight
	        del parent_model  # free memory

	        assert parent_bias.shape[1] == self.relative_attention_bias.weight.shape[1], f"{parent_bias.shape[1]} should be {self.relative_attention_bias.weight.shape[1]}"
	        assert parent_bias.shape[0] <= self.relative_attention_bias.weight.shape[0], f"{parent_bias.shape[0]} should be <= {self.relative_attention_bias.weight.shape[0]}"

	        logging.debug('init normal buckets')
	        with torch.no_grad():
	            self.relative_attention_bias.weight[:parent_bias.shape[0], :] = parent_bias

	        logging.debug('get parent buckets for additional buckets')
	        if init_additional_buckets_from is None:
	            return
	        num_additional_buckets = self.relative_attention_bias.weight.shape[0] - parent_bias.shape[0]
	        if num_additional_buckets == 0:
	            return
	        if not isinstance(init_additional_buckets_from, list):
	            init_additional_buckets_from = [init_additional_buckets_from] * num_additional_buckets
	        assert len(init_additional_buckets_from) == num_additional_buckets, f"{len(init_additional_buckets_from)} should be {num_additional_buckets}"

	        # Remember which entries were None; they get a placeholder position so that the
	        # tensor below can be built, and are skipped when copying.
	        skip_bucket = [idx is None for idx in init_additional_buckets_from]
	        init_additional_buckets_from = [0 if idx is None else idx for idx in init_additional_buckets_from]
	        init_additional_buckets_from = torch.tensor(init_additional_buckets_from, dtype=torch.long)
	        # Translate the given relative positions into standard bucket indices of the parent.
	        init_additional_buckets_from = self._relative_position_bucket(
	            relative_position=init_additional_buckets_from,
	            bidirectional=(not is_decoder),
	            num_buckets=self.relative_attention_num_buckets,
	            max_distance=self.relative_attention_max_distance,
	            use_additional_bucket=None,
	        )

	        logging.debug('Initialize relative position bias')
	        with torch.no_grad():
	            for i, (skip, idx) in enumerate(zip(skip_bucket, init_additional_buckets_from), start=parent_bias.shape[0]):
	                if skip:
	                    continue
	                self.relative_attention_bias.weight[i, :] = parent_bias[idx, :]


	class T5LayerSelfAttention(nn.Module):
	    """Pre-norm self-attention sub-layer: layer norm -> T5Attention -> dropout -> residual add."""

	    def __init__(self, config, has_relative_attention_bias=False):
	        """Create the attention module, its layer norm and the dropout applied to the attention output."""
	        super().__init__()
	        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
	        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	    def forward(
	        self,
	        hidden_states,
	        attention_mask=None,
	        position_bias=None,
	        layer_head_mask=None,
	        past_key_value=None,
	        use_cache=False,
	        output_attentions=False,
	        relative_position=None,
	        sparsity_mask=None,
	        use_additional_bucket=None,
	    ):
	        """Run self-attention on the normalised input and add the result to the input.

	        All keyword arguments are passed on to `self.SelfAttention` (`attention_mask` as `mask`).
	        Returns a tuple whose first element is the updated hidden states, followed by every
	        output of the attention module except its first one.
	        """
	        logging.debug('### SelfAttention ###')
	        attention_kwargs = dict(
	            mask=attention_mask,
	            position_bias=position_bias,
	            layer_head_mask=layer_head_mask,
	            past_key_value=past_key_value,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            relative_position=relative_position,
	            sparsity_mask=sparsity_mask,
	            use_additional_bucket=use_additional_bucket,
	        )
	        attention_output = self.SelfAttention(self.layer_norm(hidden_states), **attention_kwargs)

	        # Residual connection around the attention module.
	        residual_output = hidden_states + self.dropout(attention_output[0])
	        # Keep the remaining attention outputs (e.g. the attentions if we output them).
	        return (residual_output,) + attention_output[1:]


	class T5LayerCrossAttention(nn.Module):
	    """Encoder-decoder attention sub-layer (layer norm, attention, dropout, residual add).

	    Construction is currently disabled: `__init__` raises `NotImplementedError` before anything is set up.
	    """

	    def __init__(self, config):
	        # Fail immediately; the statements below the raise are unreachable.
	        raise NotImplementedError("might need adjustments for GLM")
	        super().__init__()
	        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
	        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	    def forward(
	        self,
	        hidden_states,
	        key_value_states,
	        attention_mask=None,
	        position_bias=None,
	        layer_head_mask=None,
	        past_key_value=None,
	        use_cache=False,
	        query_length=None,
	        output_attentions=False,
	    ):
	        """Attend from the normalized `hidden_states` to `key_value_states` and add the result back.

	        Returns:
	            A tuple whose first element is the residual sum, followed by every element of the attention
	            module's output after its first one.
	        """
	        logging.debug('### CrossAttention ###')
	        attention_kwargs = dict(
	            mask=attention_mask,
	            key_value_states=key_value_states,
	            position_bias=position_bias,
	            layer_head_mask=layer_head_mask,
	            past_key_value=past_key_value,
	            use_cache=use_cache,
	            query_length=query_length,
	            output_attentions=output_attentions,
	        )
	        attn_result = self.EncDecAttention(self.layer_norm(hidden_states), **attention_kwargs)
	        residual_sum = hidden_states + self.dropout(attn_result[0])
	        return (residual_sum,) + attn_result[1:]


	class T5Block(nn.Module):
	    """One transformer block: self-attention, optional cross-attention (decoder only), feed-forward.

	    The sub-layers live in `self.layer` in that order, so index 0 is always self-attention and index -1
	    is always the feed-forward layer.
	    """

	    def __init__(self, config, has_relative_attention_bias=False):
	        super().__init__()
	        self.is_decoder = config.is_decoder
	        sublayers = [T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)]
	        if self.is_decoder:
	            sublayers.append(T5LayerCrossAttention(config))
	        sublayers.append(T5LayerFF(config))
	        self.layer = nn.ModuleList(sublayers)

	    @staticmethod
	    def _clamp_fp16_overflow(hidden_states):
	        """Clamp a float16 tensor containing inf to just inside the float16 range; otherwise return it as is."""
	        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
	            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
	            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
	        return hidden_states

	    def forward(
	        self,
	        hidden_states,
	        attention_mask=None,
	        position_bias=None,
	        encoder_hidden_states=None,
	        encoder_attention_mask=None,
	        encoder_decoder_position_bias=None,
	        layer_head_mask=None,
	        cross_attn_layer_head_mask=None,
	        past_key_value=None,
	        use_cache=False,
	        output_attentions=False,
	        return_dict=True,
	        relative_position=None,
	        sparsity_mask=None,
	        use_additional_bucket=None,
	    ):
	        """Apply the block's sub-layers in order.

	        `relative_position`, `sparsity_mask` and `use_additional_bucket` are only forwarded to the
	        self-attention sub-layer. `return_dict` is accepted but not used here.

	        Returns:
	            `(hidden_states, present_key_value_state, *attention_outputs)` when `use_cache` is truthy,
	            otherwise `(hidden_states, *attention_outputs)`.

	        Raises:
	            ValueError: if `past_key_value` does not hold 2 entries (no `encoder_hidden_states`) or 4
	                entries (with `encoder_hidden_states`).
	        """
	        if past_key_value is None:
	            self_attn_past_key_value, cross_attn_past_key_value = None, None
	        else:
	            if not self.is_decoder:
	                logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.")
	            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

	            if len(past_key_value) != expected_num_past_key_values:
	                raise ValueError(
	                    f"There should be {expected_num_past_key_values} past states. "
	                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
	                    f"Got {len(past_key_value)} past key / value states"
	                )

	            # The first two entries belong to self-attention, the rest to cross-attention.
	            self_attn_past_key_value = past_key_value[:2]
	            cross_attn_past_key_value = past_key_value[2:]

	        self_attn_layer = self.layer[0]
	        self_attention_outputs = self_attn_layer(
	            hidden_states,
	            attention_mask=attention_mask,
	            position_bias=position_bias,
	            layer_head_mask=layer_head_mask,
	            past_key_value=self_attn_past_key_value,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            relative_position=relative_position,
	            sparsity_mask=sparsity_mask,
	            use_additional_bucket=use_additional_bucket,
	        )
	        hidden_states, present_key_value_state = self_attention_outputs[:2]
	        # Keep self-attention outputs and relative position weights
	        attention_outputs = self_attention_outputs[2:]

	        # clamp inf values to enable fp16 training
	        hidden_states = self._clamp_fp16_overflow(hidden_states)

	        if self.is_decoder and encoder_hidden_states is not None:
	            # The actual query length is unknown for cross attention when past key value states
	            # are used, so it is read from the self-attention cache and injected here.
	            query_length = None
	            if present_key_value_state is not None:
	                query_length = present_key_value_state[0].shape[2]

	            cross_attn_layer = self.layer[1]
	            cross_attention_outputs = cross_attn_layer(
	                hidden_states,
	                key_value_states=encoder_hidden_states,
	                attention_mask=encoder_attention_mask,
	                position_bias=encoder_decoder_position_bias,
	                layer_head_mask=cross_attn_layer_head_mask,
	                past_key_value=cross_attn_past_key_value,
	                query_length=query_length,
	                use_cache=use_cache,
	                output_attentions=output_attentions,
	            )
	            hidden_states = self._clamp_fp16_overflow(cross_attention_outputs[0])

	            # Combine self attn and cross attn key value states
	            if present_key_value_state is not None:
	                present_key_value_state = present_key_value_state + cross_attention_outputs[1]

	            # Keep cross-attention outputs and relative position weights
	            attention_outputs = attention_outputs + cross_attention_outputs[2:]

	        # Apply Feed Forward layer, again guarding against fp16 overflow
	        feed_forward = self.layer[-1]
	        hidden_states = self._clamp_fp16_overflow(feed_forward(hidden_states))

	        # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights),
	        # (cross-attention position bias), (cross-attention weights)
	        if use_cache:
	            return (hidden_states, present_key_value_state) + attention_outputs
	        return (hidden_states,) + attention_outputs


	class T5PreTrainedModel(PreTrainedModel):
	    """
	    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
	    models.
	    """

	    config_class = T5Config
	    load_tf_weights = load_tf_weights_in_t5
	    base_model_prefix = "transformer"
	    is_parallelizable = True
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["T5Block"]
	    _keep_in_fp32_modules = ["wo"]
	    data_processor = DataProcessor

	    @property
	    def dummy_inputs(self):
	        """Return a small dict of dummy tensors built from `DUMMY_INPUTS` and `DUMMY_MASK`."""
	        input_ids = torch.tensor(DUMMY_INPUTS)
	        input_mask = torch.tensor(DUMMY_MASK)
	        dummy_inputs = {
	            "decoder_input_ids": input_ids,
	            "input_ids": input_ids,
	            "decoder_attention_mask": input_mask,
	        }
	        return dummy_inputs

	    def _init_weights(self, module):
	        """Initialize the weights of `module` according to its type.

	        Every standard deviation is scaled by `config.initializer_factor`. Modules of any type not listed
	        below are left untouched.
	        """
	        factor = self.config.initializer_factor  # Used for testing weights initialization
	        if isinstance(module, T5LayerNorm):
	            module.weight.data.fill_(factor * 1.0)
	        elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)):
	            # Mesh TensorFlow embeddings initialization
	            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
	            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
	            # The LM head only needs its own init when it is not tied to the shared embedding.
	            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
	                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
	        elif isinstance(module, T5DenseActDense):
	            # Mesh TensorFlow FF initialization
	            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
	            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
	            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
	            if hasattr(module.wi, "bias") and module.wi.bias is not None:
	                module.wi.bias.data.zero_()
	            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
	            if hasattr(module.wo, "bias") and module.wo.bias is not None:
	                module.wo.bias.data.zero_()
	        elif isinstance(module, T5DenseGatedActDense):
	            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
	            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
	                module.wi_0.bias.data.zero_()
	            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
	            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
	                module.wi_1.bias.data.zero_()
	            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
	            if hasattr(module.wo, "bias") and module.wo.bias is not None:
	                module.wo.bias.data.zero_()
	        elif isinstance(module, T5Attention):
	            # Mesh TensorFlow attention initialization to avoid scaling before softmax
	            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
	            d_model = self.config.d_model
	            key_value_proj_dim = self.config.d_kv
	            n_heads = self.config.num_heads
	            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
	            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
	            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
	            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
	            if module.has_relative_attention_bias:
	                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))

	    def _set_gradient_checkpointing(self, module, value=False):
	        """Set `gradient_checkpointing` on `module` when it is a `T5Attention` or a `T5Stack`."""
	        if isinstance(module, (T5Attention, T5Stack)):
	            module.gradient_checkpointing = value

	    def _shift_right(self, input_ids):
	        """Shift `input_ids` one position to the right along the last dimension.

	        Position 0 is filled with `config.decoder_start_token_id`, the last input token is dropped, and
	        any `-100` value in the shifted result is replaced by `config.pad_token_id`.

	        Raises:
	            ValueError: if `config.decoder_start_token_id` or `config.pad_token_id` is `None`. These were
	                `assert` statements before, which are skipped when Python runs with `-O`.
	        """
	        decoder_start_token_id = self.config.decoder_start_token_id
	        pad_token_id = self.config.pad_token_id

	        if decoder_start_token_id is None:
	            raise ValueError(
	                "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id."
	                " See T5 docs for more information"
	            )

	        # shift inputs to the right
	        if is_torch_fx_proxy(input_ids):
	            # Item assignment is not supported natively for proxies.
	            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
	            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
	        else:
	            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
	            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
	            shifted_input_ids[..., 0] = decoder_start_token_id

	        if pad_token_id is None:
	            raise ValueError("self.model.config.pad_token_id has to be defined.")
	        # replace possible -100 values in labels by `pad_token_id`
	        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

	        return shifted_input_ids


	class T5Stack(T5PreTrainedModel):
	    """A stack of `T5Block` layers followed by a final layer norm and dropout.

	    Only the first block is built with a relative attention bias; the position bias it returns is handed
	    to every later block. The GLM-specific arguments `relative_position`, `sparsity_mask` and
	    `use_additional_bucket` are forwarded unchanged to every block.
	    """

	    def __init__(self, config, embed_tokens=None):
	        super().__init__(config)

	        self.embed_tokens = embed_tokens
	        self.is_decoder = config.is_decoder

	        self.block = nn.ModuleList(
	            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
	        )
	        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
	        self.dropout = nn.Dropout(config.dropout_rate)

	        # Initialize weights and apply final processing
	        self.post_init()
	        # Model parallel
	        self.model_parallel = False
	        self.device_map = None
	        self.gradient_checkpointing = False

	    @add_start_docstrings(PARALLELIZE_DOCSTRING)
	    def parallelize(self, device_map=None):
	        # Deprecated: spreads the blocks over devices according to `device_map`
	        # (built with `get_device_map` over all visible CUDA devices when omitted).
	        warnings.warn(
	            "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
	            " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
	            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
	            " 'block.1': 1, ...}",
	            FutureWarning,
	        )
	        # Check validity of device_map
	        self.device_map = (
	            get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
	        )
	        assert_device_map(self.device_map, len(self.block))
	        self.model_parallel = True
	        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
	        self.last_device = "cuda:" + str(max(self.device_map.keys()))
	        # Load onto devices
	        for k, v in self.device_map.items():
	            for layer in v:
	                cuda_device = "cuda:" + str(k)
	                self.block[layer] = self.block[layer].to(cuda_device)

	        # Set embed_tokens to first layer
	        self.embed_tokens = self.embed_tokens.to(self.first_device)
	        # Set final layer norm to last device
	        self.final_layer_norm = self.final_layer_norm.to(self.last_device)

	    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
	    def deparallelize(self):
	        # Deprecated: moves every block, the embeddings and the final layer norm back to the CPU.
	        warnings.warn(
	            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	            FutureWarning,
	        )
	        self.model_parallel = False
	        self.device_map = None
	        self.first_device = "cpu"
	        self.last_device = "cpu"
	        for i in range(len(self.block)):
	            self.block[i] = self.block[i].to("cpu")
	        self.embed_tokens = self.embed_tokens.to("cpu")
	        self.final_layer_norm = self.final_layer_norm.to("cpu")
	        torch.cuda.empty_cache()

	    def get_input_embeddings(self):
	        """Return the token embedding module used by this stack."""
	        return self.embed_tokens

	    def set_input_embeddings(self, new_embeddings):
	        """Replace the token embedding module used by this stack."""
	        self.embed_tokens = new_embeddings

	    def forward(
	        self,
	        input_ids=None,
	        attention_mask=None,
	        encoder_hidden_states=None,
	        encoder_attention_mask=None,
	        inputs_embeds=None,
	        head_mask=None,
	        cross_attn_head_mask=None,
	        past_key_values=None,
	        use_cache=None,
	        output_attentions=None,
	        output_hidden_states=None,
	        return_dict=None,
	        relative_position=None,
	        sparsity_mask=None,
	        use_additional_bucket=None,
	    ):
	        """Embed the inputs (unless `inputs_embeds` is given) and run them through every block.

	        Exactly one of `input_ids` and `inputs_embeds` must be provided. `use_cache`,
	        `output_attentions`, `output_hidden_states` and `return_dict` fall back to the config when `None`.

	        Returns:
	            A `BaseModelOutputWithPastAndCrossAttentions`, or, when `return_dict` is falsy, a tuple of
	            its non-`None` fields in the same order.

	        Raises:
	            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, if token ids are
	                given but the stack has no embedding module, or if `use_cache` is `True` on a
	                non-decoder stack. The last two were `assert` statements before, which are skipped when
	                Python runs with `-O`.
	        """
	        # Model parallel
	        if self.model_parallel:
	            torch.cuda.set_device(self.first_device)
	            self.embed_tokens = self.embed_tokens.to(self.first_device)
	        use_cache = use_cache if use_cache is not None else self.config.use_cache
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        if input_ids is not None and inputs_embeds is not None:
	            err_msg_prefix = "decoder_" if self.is_decoder else ""
	            raise ValueError(
	                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
	            )
	        elif input_ids is not None:
	            input_shape = input_ids.size()
	            input_ids = input_ids.view(-1, input_shape[-1])
	        elif inputs_embeds is not None:
	            input_shape = inputs_embeds.size()[:-1]
	        else:
	            err_msg_prefix = "decoder_" if self.is_decoder else ""
	            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

	        if inputs_embeds is None:
	            if self.embed_tokens is None:
	                raise ValueError("You have to initialize the model with valid token embeddings")
	            inputs_embeds = self.embed_tokens(input_ids)

	        batch_size, seq_length = input_shape

	        # required mask seq length can be calculated via length of past
	        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

	        if use_cache is True:
	            if not self.is_decoder:
	                raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

	        if attention_mask is None:
	            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
	        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
	            encoder_seq_length = encoder_hidden_states.shape[1]
	            encoder_attention_mask = torch.ones(
	                batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
	            )

	        # initialize past_key_values with `None` if past does not exist
	        if past_key_values is None:
	            past_key_values = [None] * len(self.block)

	        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
	        # ourselves in which case we just need to make it broadcastable to all heads.
	        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

	        # If a 2D or 3D attention mask is provided for the cross-attention
	        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
	        if self.is_decoder and encoder_hidden_states is not None:
	            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
	            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
	            if encoder_attention_mask is None:
	                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
	            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
	        else:
	            encoder_extended_attention_mask = None

	        if self.gradient_checkpointing and self.training:
	            if use_cache:
	                logger.warning_once(
	                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
	                )
	                use_cache = False

	        # Prepare head mask if needed
	        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
	        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
	        present_key_value_states = () if use_cache else None
	        all_hidden_states = () if output_hidden_states else None
	        all_attentions = () if output_attentions else None
	        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
	        position_bias = None
	        encoder_decoder_position_bias = None

	        hidden_states = self.dropout(inputs_embeds)

	        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
	            layer_head_mask = head_mask[i]
	            cross_attn_layer_head_mask = cross_attn_head_mask[i]
	            # Model parallel
	            if self.model_parallel:
	                torch.cuda.set_device(hidden_states.device)
	                # Ensure that attention_mask is always on the same device as hidden_states
	                if attention_mask is not None:
	                    attention_mask = attention_mask.to(hidden_states.device)
	                if position_bias is not None:
	                    position_bias = position_bias.to(hidden_states.device)
	                if encoder_hidden_states is not None:
	                    encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
	                if encoder_extended_attention_mask is not None:
	                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
	                if encoder_decoder_position_bias is not None:
	                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
	                if layer_head_mask is not None:
	                    layer_head_mask = layer_head_mask.to(hidden_states.device)
	                if cross_attn_layer_head_mask is not None:
	                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
	            if output_hidden_states:
	                all_hidden_states = all_hidden_states + (hidden_states,)

	            if self.gradient_checkpointing and self.training:

	                def create_custom_forward(module):
	                    def custom_forward(*inputs):
	                        # The GLM-specific arguments are bound here by keyword instead of being passed
	                        # through `checkpoint(...)`: reentrant checkpointing rejects extra keyword
	                        # arguments, and this wrapper only accepts positional inputs.
	                        # NOTE(review): assumes these three values do not require gradients -- confirm.
	                        return tuple(
	                            module(
	                                *inputs,
	                                use_cache,
	                                output_attentions,
	                                relative_position=relative_position,
	                                sparsity_mask=sparsity_mask,
	                                use_additional_bucket=use_additional_bucket,
	                            )
	                        )

	                    return custom_forward

	                layer_outputs = checkpoint(
	                    create_custom_forward(layer_module),
	                    hidden_states,
	                    extended_attention_mask,
	                    position_bias,
	                    encoder_hidden_states,
	                    encoder_extended_attention_mask,
	                    encoder_decoder_position_bias,
	                    layer_head_mask,
	                    cross_attn_layer_head_mask,
	                    None,  # past_key_value is always None with gradient checkpointing
	                )
	            else:
	                layer_outputs = layer_module(
	                    hidden_states,
	                    attention_mask=extended_attention_mask,
	                    position_bias=position_bias,
	                    encoder_hidden_states=encoder_hidden_states,
	                    encoder_attention_mask=encoder_extended_attention_mask,
	                    encoder_decoder_position_bias=encoder_decoder_position_bias,
	                    layer_head_mask=layer_head_mask,
	                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
	                    past_key_value=past_key_value,
	                    use_cache=use_cache,
	                    output_attentions=output_attentions,
	                    relative_position=relative_position,
	                    sparsity_mask=sparsity_mask,
	                    use_additional_bucket=use_additional_bucket,
	                )

	            # layer_outputs is a tuple with:
	            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
	            if use_cache is False:
	                # Insert a placeholder so the indices below are the same with and without a cache.
	                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

	            hidden_states, present_key_value_state = layer_outputs[:2]

	            # We share the position biases between the layers - the first layer store them
	            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
	            # (cross-attention position bias), (cross-attention weights)
	            position_bias = layer_outputs[2]
	            if self.is_decoder and encoder_hidden_states is not None:
	                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
	            # append next layer key value states
	            if use_cache:
	                present_key_value_states = present_key_value_states + (present_key_value_state,)

	            if output_attentions:
	                all_attentions = all_attentions + (layer_outputs[3],)
	                if self.is_decoder:
	                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

	            # Model Parallel: If it's the last layer for that device, put things on the next device
	            if self.model_parallel:
	                for k, v in self.device_map.items():
	                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
	                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

	        hidden_states = self.final_layer_norm(hidden_states)
	        hidden_states = self.dropout(hidden_states)

	        # Add last layer
	        if output_hidden_states:
	            all_hidden_states = all_hidden_states + (hidden_states,)

	        if not return_dict:
	            return tuple(
	                v
	                for v in [
	                    hidden_states,
	                    present_key_value_states,
	                    all_hidden_states,
	                    all_attentions,
	                    all_cross_attentions,
	                ]
	                if v is not None
	            )
	        return BaseModelOutputWithPastAndCrossAttentions(
	            last_hidden_state=hidden_states,
	            past_key_values=present_key_value_states,
	            hidden_states=all_hidden_states,
	            attentions=all_attentions,
	            cross_attentions=all_cross_attentions,
	        )


	T5_START_DOCSTRING = r"""

	The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
	Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
	Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
	text-to-text denoising generative setting.

	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
	and behavior.

	Parameters:
	config ([`T5Config`]): Model configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""

	T5_INPUTS_DOCSTRING = r"""
	Args:
	input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
	Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
	should be able to pad the inputs on both the right and the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for detail.

	[What are input IDs?](../glossary#input-ids)

	To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
	attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)
	decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, optional):
	Indices of decoder input sequence tokens in the vocabulary.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for details.

	[What are decoder input IDs?](../glossary#decoder-input-ids)

	T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
	is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

	To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
	Training](./t5#training).
	decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, optional):
	Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
	be used by default.
	head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, optional):
	Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
	1]`:

	- 1 indicates the head is not masked,
	- 0 indicates the head is masked.

	decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, optional):
	Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
	1]`:

	- 1 indicates the head is not masked,
	- 0 indicates the head is masked.

	cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, optional):
	Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
	`[0, 1]`:

	- 1 indicates the head is not masked,
	- 0 indicates the head is masked.

	encoder_outputs (`tuple(tuple(torch.FloatTensor))`, optional):
	Tuple consists of (`last_hidden_state`, `optional`: hidden_states, `optional`: attentions)
	`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
	the output of the last layer of the encoder. Used in the cross-attention of the decoder.
	past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
	Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

	If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
	don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
	`decoder_input_ids` of shape `(batch_size, sequence_length)`.
	inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
	is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
	model's internal embedding lookup matrix.
	decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, optional):
	Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
	representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
	input (see `past_key_values`). This is useful if you want more control over how to convert
	`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

	If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
	of `inputs_embeds`.

	use_cache (`bool`, optional):
	If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
	`past_key_values`).

	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more detail.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	"""

	T5_ENCODER_INPUTS_DOCSTRING = r"""
	Args:
	input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
	Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
	should be able to pad the inputs on both the right and the left.

	Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
	[`PreTrainedTokenizer.__call__`] for detail.

	To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
	attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, optional):
	Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

	- 1 for tokens that are not masked,
	- 0 for tokens that are masked.

	[What are attention masks?](../glossary#attention-mask)
	head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, optional):
	Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

	- 1 indicates the head is not masked,
	- 0 indicates the head is masked.

	inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, optional):
	Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
	is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
	model's internal embedding lookup matrix.
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more detail.
	output_hidden_states (`bool`, optional):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more detail.
	return_dict (`bool`, optional):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
	"""

	# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
	__HEAD_MASK_WARNING_MSG = """
	The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
	`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
	If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
	num_heads)`.
	"""


	@add_start_docstrings(
	"The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
	T5_START_DOCSTRING,
	)
	class T5Model(T5PreTrainedModel):
	_keys_to_ignore_on_load_missing = [
	r"encoder.embed_tokens.weight",
	r"decoder.embed_tokens.weight",
	]
	_keys_to_ignore_on_load_unexpected = [
	r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
	]

	def __init__(self, config: T5Config):
	    # Construction of the plain encoder-decoder model is disabled: this raise runs first, before
	    # `super().__init__`, so no instance can be built.
	    raise NotImplementedError("might need adjustments for GLM")
	    # NOTE: everything below is unreachable while the raise above is in place.
	    super().__init__(config)
	    # Token embedding shared by the encoder and decoder stacks.
	    self.shared = nn.Embedding(config.vocab_size, config.d_model)

	    # Encoder: a copy of the config with the decoder and cache flags switched off.
	    encoder_config = copy.deepcopy(config)
	    encoder_config.is_decoder = False
	    encoder_config.use_cache = False
	    encoder_config.is_encoder_decoder = False
	    self.encoder = T5Stack(encoder_config, self.shared)

	    # Decoder: a copy of the config flagged as decoder, with its own layer count.
	    decoder_config = copy.deepcopy(config)
	    decoder_config.is_decoder = True
	    decoder_config.is_encoder_decoder = False
	    decoder_config.num_layers = config.num_decoder_layers
	    self.decoder = T5Stack(decoder_config, self.shared)

	    # Initialize weights and apply final processing
	    self.post_init()

	    # Model parallel
	    self.model_parallel = False
	    self.device_map = None

	@add_start_docstrings(PARALLELIZE_DOCSTRING)
	def parallelize(self, device_map=None):
	    # Deprecated entry point: applies one device map, sized by the encoder's block count, to both
	    # the encoder and the decoder stack.
	    warnings.warn(
	        "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
	        " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
	        " `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':"
	        " 0, 'encoder.block.1': 1, ...}",
	        FutureWarning,
	    )
	    if device_map is None:
	        # No map given: build one over every visible CUDA device.
	        device_map = get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
	    self.device_map = device_map
	    assert_device_map(self.device_map, len(self.encoder.block))
	    for stack in (self.encoder, self.decoder):
	        stack.parallelize(self.device_map)
	    self.model_parallel = True

	@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
	def deparallelize(self):
	warnings.warn(
	"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	FutureWarning,
	)
	self.encoder.deparallelize()
	self.decoder.deparallelize()
	self.encoder = self.encoder.to("cpu")
	self.decoder = self.decoder.to("cpu")
	self.model_parallel = False
	self.device_map = None
	torch.cuda.empty_cache()

	def get_input_embeddings(self):
	return self.shared

	def set_input_embeddings(self, new_embeddings):
	self.shared = new_embeddings
	self.encoder.set_input_embeddings(new_embeddings)
	self.decoder.set_input_embeddings(new_embeddings)

	def get_encoder(self):
	return self.encoder

	def get_decoder(self):
	return self.decoder

	def _prune_heads(self, heads_to_prune):
	"""
	Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
	class PreTrainedModel
	"""
	for layer, heads in heads_to_prune.items():
	self.encoder.layer[layer].attention.prune_heads(heads)

	@add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
	@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
	def forward(
	self,
	input_ids: Optional[torch.LongTensor] = None,
	attention_mask: Optional[torch.FloatTensor] = None,
	decoder_input_ids: Optional[torch.LongTensor] = None,
	decoder_attention_mask: Optional[torch.BoolTensor] = None,
	head_mask: Optional[torch.FloatTensor] = None,
	decoder_head_mask: Optional[torch.FloatTensor] = None,
	cross_attn_head_mask: Optional[torch.Tensor] = None,
	encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	inputs_embeds: Optional[torch.Tensor] = None,
	decoder_inputs_embeds: Optional[torch.Tensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
	r"""
	Returns:

	Example:

	```python
	>>> from transformers import AutoTokenizer, T5Model

	>>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
	>>> model = T5Model.from_pretrained("t5-small")

	>>> input_ids = tokenizer(
	... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
	... ).input_ids # Batch size 1
	>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1

	>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
	>>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
	>>> decoder_input_ids = model._shift_right(decoder_input_ids)

	>>> # forward pass
	>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
	>>> last_hidden_states = outputs.last_hidden_state
	```"""
	use_cache = use_cache if use_cache is not None else self.config.use_cache
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	# FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
	if head_mask is not None and decoder_head_mask is None:
	if self.config.num_layers == self.config.num_decoder_layers:
	warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
	decoder_head_mask = head_mask

	# Encode if needed (training, first prediction pass)
	if encoder_outputs is None:
	encoder_outputs = self.encoder(
	input_ids=input_ids,
	attention_mask=attention_mask,
	inputs_embeds=inputs_embeds,
	head_mask=head_mask,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)
	elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
	encoder_outputs = BaseModelOutput(
	last_hidden_state=encoder_outputs[0],
	hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
	attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
	)

	hidden_states = encoder_outputs[0]

	# Set device for model parallelism
	if self.model_parallel:
	torch.cuda.set_device(self.decoder.first_device)
	hidden_states = hidden_states.to(self.decoder.first_device)
	if decoder_input_ids is not None:
	decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
	if attention_mask is not None:
	attention_mask = attention_mask.to(self.decoder.first_device)
	if decoder_attention_mask is not None:
	decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

	# Decode
	decoder_outputs = self.decoder(
	input_ids=decoder_input_ids,
	attention_mask=decoder_attention_mask,
	inputs_embeds=decoder_inputs_embeds,
	past_key_values=past_key_values,
	encoder_hidden_states=hidden_states,
	encoder_attention_mask=attention_mask,
	head_mask=decoder_head_mask,
	cross_attn_head_mask=cross_attn_head_mask,
	use_cache=use_cache,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	)

	if not return_dict:
	return decoder_outputs + encoder_outputs

	return Seq2SeqModelOutput(
	last_hidden_state=decoder_outputs.last_hidden_state,
	past_key_values=decoder_outputs.past_key_values,
	decoder_hidden_states=decoder_outputs.hidden_states,
	decoder_attentions=decoder_outputs.attentions,
	cross_attentions=decoder_outputs.cross_attentions,
	encoder_last_hidden_state=encoder_outputs.last_hidden_state,
	encoder_hidden_states=encoder_outputs.hidden_states,
	encoder_attentions=encoder_outputs.attentions,
	)


	@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
	class T5ForConditionalGeneration(T5PreTrainedModel):
	    # Encoder/decoder pair with a linear LM head, extended with GLM-specific encoder inputs
	    # (`relative_position`, `sparsity_mask`, `use_additional_bucket`) and optional concept handling.
	    # NOTE: `__init__` currently raises NotImplementedError, so this class cannot be instantiated
	    # in the GLM port until that guard is removed.
	    _keys_to_ignore_on_load_missing = [
	        r"encoder.embed_tokens.weight",
	        r"decoder.embed_tokens.weight",
	        r"lm_head.weight",
	    ]
	    _keys_to_ignore_on_load_unexpected = [
	        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
	    ]

	    def __init__(self, config: T5Config):
	        # Deliberate guard: the stock construction below has not been adapted to GLM yet.
	        # Everything after the raise is unreachable and kept for when the guard is lifted.
	        raise NotImplementedError("might need adjustments for GLM")
	        super().__init__(config)
	        self.model_dim = config.d_model

	        self.shared = nn.Embedding(config.vocab_size, config.d_model)

	        encoder_config = copy.deepcopy(config)
	        encoder_config.is_decoder = False
	        encoder_config.use_cache = False
	        encoder_config.is_encoder_decoder = False
	        self.encoder = T5Stack(encoder_config, self.shared)

	        decoder_config = copy.deepcopy(config)
	        decoder_config.is_decoder = True
	        decoder_config.is_encoder_decoder = False
	        decoder_config.num_layers = config.num_decoder_layers
	        self.decoder = T5Stack(decoder_config, self.shared)

	        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

	        # Initialize weights and apply final processing
	        self.post_init()

	        # Model parallel
	        self.model_parallel = False
	        self.device_map = None

	    @add_start_docstrings(PARALLELIZE_DOCSTRING)
	    def parallelize(self, device_map=None):
	        # Deprecated: spreads encoder and decoder over devices according to `device_map`
	        # (a default map over all visible CUDA devices is built when none is given) and
	        # places the LM head on the decoder's first device.
	        warnings.warn(
	            "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
	            " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
	            " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
	            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
	            FutureWarning,
	        )
	        self.device_map = (
	            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
	            if device_map is None
	            else device_map
	        )
	        assert_device_map(self.device_map, len(self.encoder.block))
	        self.encoder.parallelize(self.device_map)
	        self.decoder.parallelize(self.device_map)
	        self.lm_head = self.lm_head.to(self.decoder.first_device)
	        self.model_parallel = True

	    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
	    def deparallelize(self):
	        # Deprecated: moves both stacks and the LM head back to CPU and clears the parallel state.
	        warnings.warn(
	            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	            FutureWarning,
	        )
	        self.encoder.deparallelize()
	        self.decoder.deparallelize()
	        self.encoder = self.encoder.to("cpu")
	        self.decoder = self.decoder.to("cpu")
	        self.lm_head = self.lm_head.to("cpu")
	        self.model_parallel = False
	        self.device_map = None
	        torch.cuda.empty_cache()

	    def get_input_embeddings(self):
	        """Return the token embedding shared by encoder and decoder."""
	        return self.shared

	    def set_input_embeddings(self, new_embeddings):
	        """Replace the shared token embedding and propagate it to both stacks."""
	        self.shared = new_embeddings
	        self.encoder.set_input_embeddings(new_embeddings)
	        self.decoder.set_input_embeddings(new_embeddings)

	    def set_output_embeddings(self, new_embeddings):
	        """Replace the LM head."""
	        self.lm_head = new_embeddings

	    def get_output_embeddings(self):
	        """Return the LM head."""
	        return self.lm_head

	    def get_encoder(self):
	        """Return the encoder stack."""
	        return self.encoder

	    def get_decoder(self):
	        """Return the decoder stack."""
	        return self.decoder

	    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
	    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        decoder_input_ids: Optional[torch.LongTensor] = None,
	        decoder_attention_mask: Optional[torch.BoolTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        decoder_head_mask: Optional[torch.FloatTensor] = None,
	        cross_attn_head_mask: Optional[torch.Tensor] = None,
	        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        relative_position=None,
	        sparsity_mask=None,
	        is_concept: Optional[torch.BoolTensor] = None,
	        concept_indices: Optional[list[dict[str, tuple[int, int]]]] = None,
	        use_additional_bucket: Optional[torch.BoolTensor] = None,
	    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
	            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
	            labels in `[0, ..., config.vocab_size]`
	        relative_position (optional):
	            Forwarded unchanged to the encoder only. Cross-attention does not use positional encodings and the
	            decoder self-attention operates on a plain sequence.
	        sparsity_mask (optional):
	            Forwarded unchanged to the encoder only.
	        is_concept (`torch.BoolTensor` of shape `(batch_size, max_sequence_length)`, optional):
	            If given, `attention_mask` is multiplied by it after encoding, so the decoder only attends to concept
	            positions. The encoder still runs with the unmodified `attention_mask`. Requires `attention_mask`.
	        concept_indices (list of length `batch_size` of dicts, optional):
	            If given, each dict value is iterated as the occurrences of one concept, and each occurrence is
	            unpacked into `range(...)` to obtain its token positions. Encoder hidden states are averaged
	            position-wise across the occurrences and written back to all of them, and the attention mask is cleared
	            for every occurrence except the first. Requires `attention_mask`; all occurrences of one concept must
	            span the same number of tokens.
	        use_additional_bucket (`torch.BoolTensor`, optional):
	            Forwarded unchanged to the encoder only.

	        Returns:

	        Examples:

	        ```python
	        >>> from transformers import AutoTokenizer, T5ForConditionalGeneration

	        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
	        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")

	        >>> # training
	        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
	        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
	        >>> outputs = model(input_ids=input_ids, labels=labels)
	        >>> loss = outputs.loss
	        >>> logits = outputs.logits

	        >>> # inference
	        >>> input_ids = tokenizer(
	        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
	        ... ).input_ids  # Batch size 1
	        >>> outputs = model.generate(input_ids)
	        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
	        >>> # studies have shown that owning a dog is good for you.
	        ```"""
	        use_cache = use_cache if use_cache is not None else self.config.use_cache
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
	        if head_mask is not None and decoder_head_mask is None:
	            if self.config.num_layers == self.config.num_decoder_layers:
	                # A bare `__HEAD_MASK_WARNING_MSG` inside a class body is name-mangled to
	                # `_T5ForConditionalGeneration__HEAD_MASK_WARNING_MSG` and raises NameError, so the
	                # module-level constant is looked up through globals() instead.
	                warnings.warn(globals()["__HEAD_MASK_WARNING_MSG"], FutureWarning)
	                decoder_head_mask = head_mask

	        # Encode if needed (training, first prediction pass)
	        if encoder_outputs is None:
	            # Convert encoder inputs in embeddings if needed
	            encoder_outputs = self.encoder(
	                input_ids=input_ids,
	                attention_mask=attention_mask,
	                inputs_embeds=inputs_embeds,
	                head_mask=head_mask,
	                output_attentions=output_attentions,
	                output_hidden_states=output_hidden_states,
	                return_dict=return_dict,
	                relative_position=relative_position,
	                sparsity_mask=sparsity_mask,
	                use_additional_bucket=use_additional_bucket,
	            )
	        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
	            # Wrap caller-supplied tuples so the attribute access at the end works.
	            encoder_outputs = BaseModelOutput(
	                last_hidden_state=encoder_outputs[0],
	                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
	                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
	            )

	        hidden_states = encoder_outputs[0]

	        if self.model_parallel:
	            torch.cuda.set_device(self.decoder.first_device)

	        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
	            # get decoder inputs from shifting lm labels to the right
	            decoder_input_ids = self._shift_right(labels)

	        # Set device for model parallelism
	        if self.model_parallel:
	            torch.cuda.set_device(self.decoder.first_device)
	            hidden_states = hidden_states.to(self.decoder.first_device)
	            if decoder_input_ids is not None:
	                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
	            if attention_mask is not None:
	                attention_mask = attention_mask.to(self.decoder.first_device)
	            if decoder_attention_mask is not None:
	                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

	        if is_concept is not None:  # decoder only attends to concepts
	            attention_mask = attention_mask * is_concept

	        if concept_indices is not None:  # aggregate concept embeddings of encoder
	            # NOTE(review): the writes below are in place. `hidden_states` is the tensor held by
	            # `encoder_outputs`, and `attention_mask` is the caller's tensor unless `is_concept`
	            # or a device move already produced a new one.
	            for b, batch_concepts in enumerate(concept_indices):  # iterate over batch
	                for occurrences in batch_concepts.values():  # iterate over concepts
	                    # shape (token-length-of-concept, num-occurrences-of-concept)
	                    tmp_indices = torch.tensor(
	                        [list(range(*occurrence)) for occurrence in occurrences],
	                        device=hidden_states.device,
	                    ).T

	                    for tok_indices in tmp_indices:  # iterate over tokens within each concept
	                        # store the mean over all occurrences at every occurrence
	                        hidden_states[b, tok_indices] = torch.index_select(
	                            hidden_states[b], 0, tok_indices
	                        ).mean(dim=0, keepdim=True)

	                    attention_mask[b, tmp_indices[:, 1:]] = False  # only attend to first occurrence

	        # Decode
	        decoder_outputs = self.decoder(
	            input_ids=decoder_input_ids,
	            attention_mask=decoder_attention_mask,
	            inputs_embeds=decoder_inputs_embeds,
	            past_key_values=past_key_values,
	            encoder_hidden_states=hidden_states,
	            encoder_attention_mask=attention_mask,
	            head_mask=decoder_head_mask,
	            cross_attn_head_mask=cross_attn_head_mask,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        sequence_output = decoder_outputs[0]

	        # Set device for model parallelism
	        if self.model_parallel:
	            torch.cuda.set_device(self.encoder.first_device)
	            self.lm_head = self.lm_head.to(self.encoder.first_device)
	            sequence_output = sequence_output.to(self.lm_head.weight.device)

	        if self.config.tie_word_embeddings:
	            # Rescale output before projecting on vocab
	            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
	            sequence_output = sequence_output * (self.model_dim**-0.5)

	        lm_logits = self.lm_head(sequence_output)

	        loss = None
	        if labels is not None:
	            # Token-level cross entropy over the flattened logits; label -100 is ignored.
	            loss_fct = CrossEntropyLoss(ignore_index=-100)
	            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
	            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

	        if not return_dict:
	            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
	            return ((loss,) + output) if loss is not None else output

	        return Seq2SeqLMOutput(
	            loss=loss,
	            logits=lm_logits,
	            past_key_values=decoder_outputs.past_key_values,
	            decoder_hidden_states=decoder_outputs.hidden_states,
	            decoder_attentions=decoder_outputs.attentions,
	            cross_attentions=decoder_outputs.cross_attentions,
	            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
	            encoder_hidden_states=encoder_outputs.hidden_states,
	            encoder_attentions=encoder_outputs.attentions,
	        )

	    def prepare_inputs_for_generation(
	        self,
	        input_ids,
	        past_key_values=None,
	        attention_mask=None,
	        head_mask=None,
	        decoder_head_mask=None,
	        cross_attn_head_mask=None,
	        use_cache=None,
	        encoder_outputs=None,
	        **kwargs,
	    ):
	        """Build the keyword arguments for one decoding step of `forward`.

	        `input_ids` are the decoder tokens generated so far; they are passed on as
	        `decoder_input_ids`. Extra `kwargs` are accepted but not forwarded.
	        """
	        # cut decoder_input_ids if past is used
	        if past_key_values is not None:
	            input_ids = input_ids[:, -1:]

	        return {
	            "decoder_input_ids": input_ids,
	            "past_key_values": past_key_values,
	            "encoder_outputs": encoder_outputs,
	            "attention_mask": attention_mask,
	            "head_mask": head_mask,
	            "decoder_head_mask": decoder_head_mask,
	            "cross_attn_head_mask": cross_attn_head_mask,
	            "use_cache": use_cache,
	        }

	    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
	        """Derive decoder input ids by shifting `labels` one position to the right."""
	        return self._shift_right(labels)

	    def _reorder_cache(self, past_key_values, beam_idx):
	        """Reorder every cached state along dim 0 according to `beam_idx`.

	        Returns `past_key_values` unchanged (after logging a hint) when it is None; otherwise a
	        tuple with one tuple of reordered states per layer.
	        """
	        # if decoder past is not included in output
	        # speedy decoding is disabled and no need to reorder
	        if past_key_values is None:
	            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
	            return past_key_values

	        reordered_decoder_past = ()
	        for layer_past_states in past_key_values:
	            # select the beams along dim 0 of each key / value state of this layer,
	            # on whatever device the state lives on
	            reordered_layer_past_states = tuple(
	                layer_past_state.index_select(0, beam_idx.to(layer_past_state.device))
	                for layer_past_state in layer_past_states
	            )

	            assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
	            assert len(reordered_layer_past_states) == len(layer_past_states)

	            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
	        return reordered_decoder_past

	@add_start_docstrings(
	    "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
	    T5_START_DOCSTRING,
	)
	class T5EncoderModel(T5PreTrainedModel):
	    # Encoder-only wrapper: a shared token embedding plus one encoder stack, with the GLM-specific
	    # inputs (`relative_position`, `sparsity_mask`, `use_additional_bucket`) passed through.
	    _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]

	    def __init__(self, config: T5Config):
	        super().__init__(config)
	        self.shared = nn.Embedding(config.vocab_size, config.d_model)

	        # The stack gets its own config copy so the flags below do not leak into `config`.
	        stack_config = copy.deepcopy(config)
	        stack_config.use_cache = False
	        stack_config.is_encoder_decoder = False
	        self.encoder = T5Stack(stack_config, self.shared)

	        # Initialize weights and apply final processing
	        self.post_init()

	        # Model parallel
	        self.model_parallel = False
	        self.device_map = None

	    @add_start_docstrings(PARALLELIZE_DOCSTRING)
	    def parallelize(self, device_map=None):
	        # Deprecated: spreads the encoder over devices according to `device_map`
	        # (a default map over all visible CUDA devices is built when none is given).
	        warnings.warn(
	            "`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
	            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
	            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
	            " 'block.1': 1, ...}",
	            FutureWarning,
	        )
	        if device_map is None:
	            device_map = get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
	        self.device_map = device_map
	        assert_device_map(self.device_map, len(self.encoder.block))
	        self.encoder.parallelize(self.device_map)
	        self.model_parallel = True

	    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
	    def deparallelize(self):
	        # Deprecated: moves the encoder back to CPU and clears the model-parallel state.
	        warnings.warn(
	            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	            FutureWarning,
	        )
	        self.encoder.deparallelize()
	        self.encoder = self.encoder.to("cpu")
	        self.model_parallel = False
	        self.device_map = None
	        torch.cuda.empty_cache()

	    def get_input_embeddings(self):
	        """Return the token embedding used by the encoder."""
	        return self.shared

	    def set_input_embeddings(self, new_embeddings):
	        """Replace the token embedding and propagate it to the encoder stack."""
	        self.shared = new_embeddings
	        self.encoder.set_input_embeddings(new_embeddings)

	    def get_encoder(self):
	        """Return the encoder stack."""
	        return self.encoder

	    def _prune_heads(self, heads_to_prune):
	        """
	        Prune self-attention heads. `heads_to_prune` maps a layer number to the list of heads to remove in that
	        layer. See base class PreTrainedModel.
	        """
	        for layer_idx, head_ids in heads_to_prune.items():
	            self_attention = self.encoder.block[layer_idx].layer[0].SelfAttention
	            self_attention.prune_heads(head_ids)

	    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
	    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        relative_position: Optional[torch.LongTensor] = None,
	        sparsity_mask: Optional[torch.BoolTensor] = None,
	        use_additional_bucket: Optional[torch.BoolTensor] = None,
	    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
	        r"""
	        Returns:

	        Example:

	        ```python
	        >>> from transformers import AutoTokenizer, T5EncoderModel

	        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
	        >>> model = T5EncoderModel.from_pretrained("t5-small")
	        >>> input_ids = tokenizer(
	        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
	        ... ).input_ids  # Batch size 1
	        >>> outputs = model(input_ids=input_ids)
	        >>> last_hidden_states = outputs.last_hidden_state
	        ```"""
	        if return_dict is None:
	            return_dict = self.config.use_return_dict

	        # Everything is delegated to the encoder stack; its result is returned as is.
	        return self.encoder(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            inputs_embeds=inputs_embeds,
	            head_mask=head_mask,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	            relative_position=relative_position,
	            sparsity_mask=sparsity_mask,
	            use_additional_bucket=use_additional_bucket,
	        )

	    def init_relative_position_bias(self, modelsize: str, init_decoder: bool = False, init_additional_buckets_from: list[int] = None):
	        """Delegate relative-position-bias initialisation to the first block's self-attention.

	        With `init_decoder=False` the encoder stack is used. With `init_decoder=True` the decoder stack is
	        looked up on `self`.
	        NOTE(review): this class never creates `self.decoder`, so `init_decoder=True` is expected to fail
	        with an AttributeError here - confirm whether the decoder path is meant to be supported.
	        """
	        stack = self.decoder if init_decoder else self.encoder
	        stack.block[0].layer[0].SelfAttention.init_relative_position_bias(
	            modelsize=modelsize,
	            is_decoder=init_decoder,
	            init_additional_buckets_from=init_additional_buckets_from,
	        )


	class GraphT5Classifier(PreTrainedModel):
	    """
	    T5 encoder with a single linear classification head applied to every encoder position.

	    The config is expected to carry the extra attributes set by `get_config` (`num_classes`, `modelsize`,
	    `relative_attention_num_additional_buckets`, `model_max_length`).
	    """
	    config_class = T5Config

	    def __init__(
	        self,
	        config: T5Config,
	    ):
	        # T5Tokenizer is not among the imports visible at the top of this file, so it is imported
	        # locally to keep construction from depending on an import elsewhere.
	        from transformers import T5Tokenizer

	        super().__init__(config=config)
	        self.config = config
	        self.tokenizer = T5Tokenizer.from_pretrained(self.config.modelsize, model_max_length=self.config.model_max_length)

	        # When this classifier itself is created via `.from_pretrained`, the weights loaded here are
	        # overwritten by the checkpoint afterwards, so this load is unnecessary overhead in that case.
	        self.t5model = T5EncoderModel.from_pretrained(self.config.modelsize, config=config, ignore_mismatched_sizes=True)
	        self.hidden_size = self.t5model.config.d_model
	        self.classification_head = nn.Linear(self.hidden_size, self.config.num_classes, bias=True)
	        self.softmax = nn.Softmax(dim=-1)

	    @staticmethod
	    def get_config(num_classes: int, modelsize: str = "t5-base", num_additional_buckets: int = 0, model_max_length: int = 512) -> T5Config:
	        """Load the `modelsize` T5 config and attach the classifier-specific attributes.

	        Args:
	            num_classes: Output size of the classification head.
	            modelsize: Name or path passed to `T5Config.from_pretrained` and later to the tokenizer/encoder.
	            num_additional_buckets: Stored as `relative_attention_num_additional_buckets`.
	            model_max_length: Stored on the config and used as the tokenizer's `model_max_length`.

	        Returns:
	            The extended `T5Config`.
	        """
	        config = T5Config.from_pretrained(modelsize)
	        config.num_classes = int(num_classes)
	        config.modelsize = str(modelsize)
	        config.relative_attention_num_additional_buckets = int(num_additional_buckets)
	        config.model_max_length = int(model_max_length)
	        return config

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        relative_position: torch.Tensor,
	        sparsity_mask: torch.Tensor,
	        use_additional_bucket: torch.Tensor,
	    ) -> torch.Tensor:
	        """Encode the inputs and return per-position logits of shape (batch_size, seq_len, num_classes)."""
	        logging.debug('t5 encoder model')
	        output = self.t5model(
	            input_ids=input_ids,
	            relative_position=relative_position,
	            sparsity_mask=sparsity_mask,
	            use_additional_bucket=use_additional_bucket,
	        )  # output[0]: (batch_size, seq_len, hidden_size)
	        logging.debug('classification head')
	        logits = self.classification_head(output[0])  # (batch_size, seq_len, num_classes)

	        return logits

	    def get_probabilities(self, logits: torch.Tensor) -> torch.Tensor:
	        """Apply softmax over the last dimension of `logits`."""
	        return self.softmax(logits)  # (batch_size, seq_len, num_classes)

	    def get_label(self, logits: torch.Tensor) -> torch.Tensor:
	        """Return the argmax class index over the last dimension of `logits`."""
	        return torch.argmax(logits, dim=-1)


	class DualGraphT5Classifier(PreTrainedModel):
	    """
	    Same as GraphT5Classifier, but with two classification heads

	    Both heads read the same encoder output. The config is expected to carry the extra attributes set by
	    `get_config` (`num_classes1`, `num_classes2`, `modelsize`, `relative_attention_num_additional_buckets`,
	    `model_max_length`).
	    """
	    config_class = T5Config

	    def __init__(
	        self,
	        config: T5Config,
	    ):
	        # T5Tokenizer is not among the imports visible at the top of this file, so it is imported
	        # locally to keep construction from depending on an import elsewhere.
	        from transformers import T5Tokenizer

	        super().__init__(config=config)
	        self.config = config
	        self.tokenizer = T5Tokenizer.from_pretrained(self.config.modelsize, model_max_length=self.config.model_max_length)

	        self.t5model = T5EncoderModel.from_pretrained(self.config.modelsize, config=config, ignore_mismatched_sizes=True)
	        self.hidden_size = self.t5model.config.d_model
	        self.classification_head1 = nn.Linear(self.hidden_size, self.config.num_classes1, bias=True)
	        self.classification_head2 = nn.Linear(self.hidden_size, self.config.num_classes2, bias=True)
	        self.softmax = nn.Softmax(dim=-1)

	    @staticmethod
	    def get_config(num_classes1: int, num_classes2: int, modelsize: str = "t5-base", num_additional_buckets: int = 0, model_max_length: int = 512) -> T5Config:
	        """Load the `modelsize` T5 config and attach the classifier-specific attributes.

	        Args:
	            num_classes1: Output size of the first classification head.
	            num_classes2: Output size of the second classification head.
	            modelsize: Name or path passed to `T5Config.from_pretrained` and later to the tokenizer/encoder.
	            num_additional_buckets: Stored as `relative_attention_num_additional_buckets`.
	            model_max_length: Stored on the config and used as the tokenizer's `model_max_length`.

	        Returns:
	            The extended `T5Config`.
	        """
	        config = T5Config.from_pretrained(modelsize)
	        config.num_classes1 = int(num_classes1)
	        config.num_classes2 = int(num_classes2)
	        config.modelsize = str(modelsize)
	        config.relative_attention_num_additional_buckets = int(num_additional_buckets)
	        config.model_max_length = int(model_max_length)
	        return config

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        relative_position: torch.Tensor,
	        sparsity_mask: torch.Tensor,
	        use_additional_bucket: torch.Tensor,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Encode the inputs once and return `(logits1, logits2)`, one per-position logit tensor per head."""
	        logging.debug('t5 encoder model')
	        output = self.t5model(
	            input_ids=input_ids,
	            relative_position=relative_position,
	            sparsity_mask=sparsity_mask,
	            use_additional_bucket=use_additional_bucket,
	        )  # output[0]: (batch_size, seq_len, hidden_size)
	        logging.debug('classification head 1')
	        logits1 = self.classification_head1(output[0])  # (batch_size, seq_len, num_classes1)
	        logging.debug('classification head 2')
	        logits2 = self.classification_head2(output[0])  # (batch_size, seq_len, num_classes2)

	        return logits1, logits2

	    def get_probabilities(self, logits: torch.Tensor) -> torch.Tensor:
	        """Apply softmax over the last dimension of `logits` (the output of either head)."""
	        return self.softmax(logits)  # (batch_size, seq_len, num_classes)

	    def get_label(self, logits: torch.Tensor) -> torch.Tensor:
	        """Return the argmax class index over the last dimension of `logits`."""
	        return torch.argmax(logits, dim=-1)