hertz-dev

Paused

hertz-dev / transformer.py

calculating

committing...

824afbf 22 days ago

12.2 kB

	from typing import Optional, Tuple, MutableMapping
	from typing import Union
	import math
	from contextlib import nullcontext

	import torch
	import torch as T
	import torch.nn as nn
	import torch.nn.functional as F
	from torch import Tensor
	from torch.nn.attention import SDPBackend

	from einops import rearrange

	from utils import si_module, default, exists, load_ckpt

	CACHE_FILL_VALUE = -1

	def get_cache_len(cache: Optional[Tensor]) -> int:
	"""
	cache: (batch, seq_len, 2, kv_heads, head_dim)
	"""
	if cache is None:
	return 0
	nonzeros = T.any(cache.flatten(2) != CACHE_FILL_VALUE, dim=-1)
	length = nonzeros.sum(dim=-1).int()
	assert T.all(length == length[0])
	return length[0]


	def rotate_half(x):
	x1, x2 = x.chunk(2, dim=-1)
	return torch.cat((-x2, x1), dim=-1)


	def apply_rotary_pos_emb(x, cos, sin, offset: int = 0):
	assert (
	cos.shape[1] >= offset + x.shape[1]
	), f"Offset and/or input sequence is too large,\
	\n offset: {offset}, seq_len: {x.shape[1]}, max: {cos.shape[1]}"

	cos_out = cos[:, offset : offset + x.shape[1], :, :]
	sin_out = sin[:, offset : offset + x.shape[1], :, :]

	return (x * cos_out) + (rotate_half(x) * sin_out)


	# Adapted from https://github.com/foundation-model-stack/foundation-model-stack
	class ShapeRotator:
	def __init__(
	self,
	dim: int,
	end: int,
	theta: float = 10_000,
	):
	super().__init__()
	self.dim = dim
	self.ratio = theta
	self.cached_freqs: MutableMapping[int, MutableMapping[int, torch.Tensor]] = {}
	self.max_seq_len_cached: MutableMapping[int, int] = {}
	self.ntk_scaling = False
	self.max_seq_len = end

	def compute_freqs_cis(self, device, max_seq_len=None):
	alpha = 1
	dev_idx = device.index
	max_seq_len = default(max_seq_len, self.max_seq_len)

	if dev_idx not in self.cached_freqs:
	self.cached_freqs[dev_idx] = {}
	if dev_idx not in self.max_seq_len_cached:
	self.max_seq_len_cached[dev_idx] = 0


	if self.max_seq_len_cached[dev_idx] > 0:
	return 1
	max_seq_len = max(max_seq_len, self.max_seq_len)

	if (
	1 in self.cached_freqs[dev_idx]
	and max_seq_len <= self.max_seq_len_cached[dev_idx]
	):
	return 1

	ratio = self.ratio
	dim = self.dim

	freqs = 1.0 / (ratio ** (torch.arange(0, dim, 2, device=device).float() / dim))

	t = torch.arange(max_seq_len, device=device, dtype=freqs.dtype)
	freqs = torch.einsum("i,j->ij", t, freqs)
	emb = torch.cat((freqs, freqs), dim=-1).to(device)

	cos_to_cache = emb.cos()[None, :, None, :]
	sin_to_cache = emb.sin()[None, :, None, :]

	self.max_seq_len_cached[dev_idx] = max_seq_len

	self.cached_freqs[dev_idx][alpha] = torch.stack(
	[
	cos_to_cache,
	sin_to_cache,
	],
	dim=-1,
	)

	return alpha

	def rotate(
	self,
	q: Tensor,
	k: Tensor,
	offset: int = 0,
	) -> Tuple[Tensor, Tensor]:
	"""
	Args
	----
	q : torch.Tensor
	Embedded query tensor, expected size is B x S x H x Eh
	k : torch.Tensor
	Embedded query tensor, expected size is B x S x H x Eh
	"""
	assert len(q.size()) == 4
	assert len(k.size()) == 4

	seq_len = self.max_seq_len
	alpha = self.compute_freqs_cis(q.device, seq_len)
	freqs = self.cached_freqs[q.device.index][alpha]

	freqs = freqs.float() # 1 L D/2 2 2
	q_out = apply_rotary_pos_emb(q, freqs[..., 0], freqs[..., 1], offset=offset).type_as(q)
	k_out = apply_rotary_pos_emb(k, freqs[..., 0], freqs[..., 1], offset=offset).type_as(k)

	return q_out.view_as(q), k_out.view_as(k)

	class Linear(nn.Linear):
	def __init__(self, args, *kwargs):
	super().__init__(args, *kwargs, bias=False)

	class Norm(nn.Module):
	def __init__(self,
	dim: int,
	eps: float = 1e-5,) -> None:
	super().__init__()
	self.eps = eps
	self.weight = nn.Parameter(T.ones((dim,)))

	def forward(self, input: Tensor) -> Tensor:
	return F.layer_norm(input, (self.weight.shape[0],), weight=self.weight, bias=None, eps=self.eps)


	class FFNN(nn.Module):
	def __init__(self,
	dim: int,
	expand_dim: int = None,):
	super().__init__()
	expand_dim = default(expand_dim, 256 * ((int(2 * 4 * dim / 3) + 256 - 1) // 256))
	self.dim = dim
	self.expand_dim = expand_dim

	self.gateup_proj = Linear(dim, 2*expand_dim)
	self.down_proj = Linear(expand_dim, dim)

	def forward(self, x):
	gate, up = self.gateup_proj(x).chunk(2, dim=-1)
	return self.down_proj(up * F.silu(gate))

	class GQA(nn.Module):
	def __init__(self,
	dim: int,
	n_head: int,
	shape_rotator: ShapeRotator,
	kv_heads: Optional[int] = None,
	eps: float = 1e-5,
	causal: bool = True,):
	super().__init__()
	self.n_heads = n_head
	self.kv_heads = default(kv_heads, n_head)
	self.head_dim = dim // n_head
	self.causal = causal

	self.proj_qkv = Linear(dim, self.head_dim(n_head+2self.kv_heads))

	self.norm_q = Norm(self.head_dim*n_head, eps=eps)
	self.norm_k = Norm(self.head_dim*self.kv_heads, eps=eps)

	self.attn_out = Linear(dim, dim)

	self.shape_rotator = shape_rotator

	def _sdpa(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
	k = k.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
	v = v.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
	with nn.attention.sdpa_kernel(SDPBackend.FLASH_ATTENTION) if k.device.type == 'cuda' else nullcontext():
	x = F.scaled_dot_product_attention(
	q.transpose(1, 2),
	k.transpose(1, 2),
	v.transpose(1, 2),
	is_causal=False if (q.size(1) != k.size(1)) else self.causal,
	)
	x = x.transpose(1, 2).contiguous()
	return x

	def _attend(self, q: Tensor, k: Tensor, v: Tensor, kv_cache: Optional[Tensor] = None,):
	cache_len = get_cache_len(kv_cache)
	q, k = self.shape_rotator.rotate(q, k, offset=cache_len)
	if exists(kv_cache):
	k = T.cat([kv_cache[:, :cache_len, 0], k], dim=1)
	v = T.cat([kv_cache[:, :cache_len, 1], v], dim=1)
	kv_cache[:, :k.size(1), 0] = k
	kv_cache[:, :v.size(1), 1] = v
	x = self._sdpa(q, k, v)
	return self.attn_out(rearrange(x, 'b s h d -> b s (h d)'))

	def _project(self, x):
	full_q, full_k, full_v = self.proj_qkv(x).chunk(3, dim=-1)
	normed_full_q = self.norm_q(full_q).to(full_q.dtype)
	normed_full_k = self.norm_k(full_k).to(full_k.dtype)

	q = rearrange(normed_full_q, 'b s (h d) -> b s h d', h=self.n_heads)
	k = rearrange(normed_full_k, 'b s (h d) -> b s h d', h=self.kv_heads)
	v = rearrange(full_v, 'b s (h d) -> b s h d', h=self.kv_heads)
	return q, k, v

	def forward(self,
	x: Tensor,
	kv: Optional[Tensor] = None,):
	"""
	x: (B, S, D)
	kv: (B, S, H, D)
	"""
	q, k, v = self._project(x)
	return self._attend(q, k, v, kv_cache=kv)


	class PreNormAttn(nn.Module):
	def __init__(self,
	dim: int,
	n_head: int,
	shape_rotator: ShapeRotator,
	kv_heads: Optional[int] = None,
	eps: float = 1e-5,
	causal: bool = True,):
	super().__init__()
	self.attn_norm = Norm(dim, eps=eps)
	self.attn = GQA(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)

	def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
	"""
	x: (B, S, D)
	kv: (B, S, H, D)
	"""
	return x + self.attn(self.attn_norm(x), kv)

	class PreNormFFNN(nn.Module):
	def __init__(self,
	dim: int,
	ff_dim: int,
	eps: float = 1e-5,):
	super().__init__()
	self.ffnn_norm = Norm(dim, eps=eps)
	self.ffnn = FFNN(dim, ff_dim)

	def forward(self, x: Tensor) -> Tensor:
	return x + self.ffnn(self.ffnn_norm(x))

	class Block(nn.Module):
	def __init__(self,
	dim: int,
	layer_id: int = 0,
	n_head: int = 16,
	kv_heads: Optional[int] = None,
	ff_dim: Optional[int] = None,
	eps: float = 1e-5,
	causal: bool = True,
	shape_rotator: ShapeRotator = None):
	super().__init__()
	self.attn = PreNormAttn(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
	self.ffnn = PreNormFFNN(dim, ff_dim, eps=eps)
	self.dim = dim
	self.layer_id = layer_id
	self.head_dim = dim // n_head
	self.expand_dim = self.ffnn.ffnn.expand_dim

	self.reset_parameters()

	def reset_parameters(self):
	std = 1.0 / math.sqrt(self.dim)
	nn.init.trunc_normal_(self.ffnn.ffnn.gateup_proj.weight, std=std, a=-3 * std, b=3 * std)
	nn.init.trunc_normal_(self.attn.attn.proj_qkv.weight, std=std, a=-3 * std, b=3 * std)
	nn.init.trunc_normal_(self.attn.attn.attn_out.weight, std=std, a=-3 * std, b=3 * std)

	xstd = 1.0 / math.sqrt(self.expand_dim)
	nn.init.trunc_normal_(self.ffnn.ffnn.down_proj.weight, std=xstd, a=-3 * xstd, b=3 * xstd)

	def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
	"""
	x: (B, S, D)
	kv: (B, S, H, D)
	"""
	h = self.attn(x, kv)
	out = self.ffnn(h)
	return out



	class GPTOutput(nn.Module):
	def __init__(self, dim, vocab_size):
	super().__init__()
	self.dim = dim
	self.norm = Norm(dim)
	self.output = Linear(dim, vocab_size)

	self.reset_parameters()

	def reset_parameters(self):
	std = 1.0 / math.sqrt(self.dim**2)
	nn.init.trunc_normal_(self.output.weight, std=std, a=-3 * std, b=3 * std)

	def forward(self, x):
	return self.output(self.norm(x))

	@si_module
	class Stack(nn.Module):
	class Config:
	layers: int
	dim: int
	seq_len: int
	n_head: int = 32
	ff_dim: int = None
	kv_heads: int = None
	eps: float = 1e-5
	theta: Union[int, float] = 10_000
	causal: bool = True

	from_pretrained: Optional[Tuple[str, int]] = None

	def __init__(self, c: Config):
	super().__init__()

	from_pretrained = c.from_pretrained
	if exists(from_pretrained):
	checkpoint = load_ckpt(c.from_pretrained)

	self.shape_rotator = ShapeRotator(c.dim//c.n_head, c.seq_len, theta=c.theta)

	self.layers = nn.ModuleList([
	Block(
	dim=c.dim,
	layer_id=l,
	n_head=c.n_head,
	kv_heads=c.kv_heads,
	ff_dim=c.ff_dim,
	eps=c.eps,
	causal=c.causal,
	shape_rotator=self.shape_rotator,
	) for l in range(c.layers)
	])

	kv_heads = c.kv_heads or c.n_head
	head_dim = c.dim // c.n_head
	cache_shape = [c.layers, c.seq_len, 2, kv_heads, head_dim]
	self.cache_shape = cache_shape
	self.cache = [None] * c.layers

	if exists(from_pretrained):
	self.load_state_dict(checkpoint)

	def init_cache(self, bsize, device, dtype, length:int=None):
	if self.cache_shape is None:
	return
	cache_shape = self.cache_shape.copy()
	cache_shape[1] = length or cache_shape[1]
	self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)

	def deinit_cache(self):
	self.cache = [None] * len(self.cache)

	def forward(self, x: Tensor) -> Tensor:
	for l, layer in enumerate(self.layers):
	x = layer(x, kv=self.cache[l])
	return x