test-flex-gpt / rotary.py

oweller2

added in file

204da06 17 days ago

12 kB

	# Copyright 2024 AUTHORS_TODO
	# License: Apache-2.0

	# Copyright (c) 2023, Tri Dao.
	# License: Apache-2.0

	import torch
	from einops import rearrange
	from flash_attn.ops.triton.rotary import apply_rotary

	from typing import Optional, Tuple, Union


	class ApplyRotaryEmbUnpad(torch.autograd.Function):
	@staticmethod
	def forward(
	ctx,
	qkv,
	cos,
	sin,
	interleaved=False,
	seqlen_offsets: Union[int, torch.Tensor] = 0,
	cu_seqlens: Optional[torch.Tensor] = None,
	max_seqlen: Optional[int] = None,
	):
	# (total_nnz, 3, nheads, headdim)
	total_nnz, three, nheads, headdim = qkv.shape
	assert three == 3
	if qkv.is_contiguous():
	# Call 1 kernel instead of 2 kernels
	# We need qkv to be contiguous so that when we reshape to combine (3, nheads)
	# dimensions, we get the same tensor
	# qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
	qk = qkv[:, :2].view(total_nnz, -1, headdim)
	apply_rotary(
	qk,
	cos,
	sin,
	seqlen_offsets=seqlen_offsets,
	cu_seqlens=cu_seqlens,
	max_seqlen=max_seqlen,
	interleaved=interleaved,
	inplace=True,
	)
	else:
	q, k = qkv[:, 0, :, :], qkv[:, 1, :, :]
	apply_rotary(
	q,
	cos,
	sin,
	seqlen_offsets=seqlen_offsets,
	cu_seqlens=cu_seqlens,
	max_seqlen=max_seqlen,
	interleaved=interleaved,
	inplace=True,
	)
	apply_rotary(
	k,
	cos,
	sin,
	seqlen_offsets=seqlen_offsets,
	cu_seqlens=cu_seqlens,
	max_seqlen=max_seqlen,
	interleaved=interleaved,
	inplace=True,
	)

	if isinstance(seqlen_offsets, int):
	ctx.save_for_backward(cos, sin, cu_seqlens)
	ctx.seqlen_offsets = seqlen_offsets
	else:
	ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
	ctx.seqlen_offsets = None
	ctx.interleaved = interleaved
	ctx.max_seqlen = max_seqlen
	return qkv

	@staticmethod
	def backward(ctx, do):
	seqlen_offsets = ctx.seqlen_offsets
	if seqlen_offsets is None:
	cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
	else:
	cos, sin, cu_seqlens = ctx.saved_tensors
	if do.is_contiguous():
	total_nnz, three, nheads, headdim = do.shape
	# Call 1 kernel instead of 2 kernels
	# We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
	# dimensions, we get the same tensor
	dqk = do[:, :2].view(total_nnz, -1, headdim)
	apply_rotary(
	dqk,
	cos,
	sin,
	seqlen_offsets=seqlen_offsets,
	cu_seqlens=cu_seqlens,
	max_seqlen=ctx.max_seqlen,
	interleaved=ctx.interleaved,
	inplace=True,
	conjugate=True,
	)
	else:
	dq, dk = do[:, 0, :, :], do[:, 1, :, :]
	apply_rotary(
	dq,
	cos,
	sin,
	seqlen_offsets=seqlen_offsets,
	cu_seqlens=cu_seqlens,
	max_seqlen=ctx.max_seqlen,
	interleaved=ctx.interleaved,
	inplace=True,
	conjugate=True,
	)
	apply_rotary(
	dk,
	cos,
	sin,
	seqlen_offsets=seqlen_offsets,
	cu_seqlens=cu_seqlens,
	max_seqlen=ctx.max_seqlen,
	interleaved=ctx.interleaved,
	inplace=True,
	conjugate=True,
	)

	return do, None, None, None, None, None, None


	def apply_rotary_emb_unpad(
	qkv,
	cos,
	sin,
	interleaved=False,
	seqlen_offsets: Union[int, torch.Tensor] = 0,
	cu_seqlens: Optional[torch.Tensor] = None,
	max_seqlen: Optional[int] = None,
	):
	"""
	Arguments:
	qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
	cos, sin: (seqlen_rotary, rotary_dim / 2)
	interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
	of 1st half and 2nd half (GPT-NeoX style).
	inplace: if True, apply rotary embedding in-place.
	seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
	Most commonly used in inference when we have KV cache.
	cu_seqlens: (batch + 1,) or None
	max_seqlen: int
	Return:
	out: (total_nnz, dim)
	rotary_dim must be <= headdim
	Apply rotary embedding to the first rotary_dim of x.
	"""
	return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, interleaved, seqlen_offsets, cu_seqlens, max_seqlen)


	class UnpaddedRotaryEmbedding(torch.nn.Module):
	"""
	The rotary position embeddings applied directly to unpadded sequences.
	"""

	def __init__(
	self,
	dim: int,
	base: float = 10000.0,
	interleaved: bool = False,
	max_seqlen: Optional[int] = None,
	scale_base: Optional[bool] = None,
	pos_idx_in_fp32: bool = True,
	device: Optional[torch.device] = None,
	dtype: Optional[torch.dtype] = None,
	):
	"""
	interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
	of 1st half and 2nd half (GPT-NeoX style).
	pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
	otherwise they might be in lower precision.
	This option was added because previously (before 2023-07-02), when we construct
	the position indices, we use the dtype of self.inv_freq. In most cases this would
	be fp32, but if the model is trained in pure bf16 (not mixed precision), then
	self.inv_freq would be bf16, and the position indices are also in bf16.
	Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
	embeddings for some positions will coincide.
	To maintain compatibility with models previously trained in pure bf16,
	we add this option.
	max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
	up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
	the cos_sin_cache wll be recomputed during the forward pass.
	"""
	super().__init__()
	self.dim = dim
	self.base = float(base)
	self.pos_idx_in_fp32 = pos_idx_in_fp32
	# Generate and save the inverse frequency buffer (non trainable)
	inv_freq = self._compute_inv_freq(device)
	self.register_buffer("inv_freq", inv_freq, persistent=False)
	self.interleaved = interleaved
	self.scale_base = scale_base
	scale = (
	(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
	if scale_base is not None
	else None
	)
	self.register_buffer("scale", scale, persistent=False)

	self._seq_len_cached = 0
	self._cos_cached = None
	self._sin_cached = None
	self._cos_k_cached = None
	self._sin_k_cached = None

	if max_seqlen is not None and device is not None and dtype is not None:
	self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

	def _compute_inv_freq(self, device=None):
	return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))

	def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
	# Reset the tables if the sequence length has changed,
	# if we're on a new device (possibly due to tracing for instance),
	# or if we're switching from inference mode to training
	if (
	seqlen > self._seq_len_cached
	or self._cos_cached is None
	or self._cos_cached.device != device
	or self._cos_cached.dtype != dtype
	or (self.training and self._cos_cached.is_inference())
	):
	self._seq_len_cached = seqlen
	# We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
	# And the output of arange can be quite large, so bf16 would lose a lot of precision.
	# However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
	if self.pos_idx_in_fp32:
	t = torch.arange(seqlen, device=device, dtype=torch.float32)
	# We want fp32 here as well since inv_freq will be multiplied with t, and the output
	# will be large. Having it in bf16 will lose a lot of precision and cause the
	# cos & sin output to change significantly.
	# We want to recompute self.inv_freq if it was not loaded in fp32
	if self.inv_freq.dtype != torch.float32:
	inv_freq = self._compute_inv_freq(device=device)
	else:
	inv_freq = self.inv_freq
	else:
	t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
	inv_freq = self.inv_freq
	# Don't do einsum, it converts fp32 to fp16 under AMP
	# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
	freqs = torch.outer(t, inv_freq)
	if self.scale is None:
	self._cos_cached = torch.cos(freqs).to(dtype)
	self._sin_cached = torch.sin(freqs).to(dtype)
	else:
	power = (
	torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
	) / self.scale_base
	scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
	# We want the multiplication by scale to happen in fp32
	self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
	self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
	self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
	self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)

	def forward(
	self,
	qkv: torch.Tensor,
	cu_seqlens: torch.Tensor,
	max_seqlen: Optional[int] = None,
	seqlen_offset: Union[int, torch.Tensor] = 0,
	) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
	"""
	qkv: (total_nnz, 3, nheads, headdim)
	cu_seqlens: (batch + 1,) cumulative sequence lengths
	max_seqlen: int max seq length in the batch
	seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
	Most commonly used in inference when we have KV cache.
	If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
	should pass in max_seqlen, which will update the cos / sin cache up to that length.
	Apply rotary embedding inplace to qkv.
	"""
	if max_seqlen is not None:
	self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

	qkv = apply_rotary_emb_unpad(
	qkv,
	self._cos_cached,
	self._sin_cached,
	interleaved=self.interleaved,
	seqlen_offsets=seqlen_offset,
	cu_seqlens=cu_seqlens,
	max_seqlen=max_seqlen,
	)

	return qkv

	def extra_repr(self) -> str:
	return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"