# cde-small-v1 / model.py
from typing import Callable, Optional, Tuple
import copy
import math
import multiprocessing
import os
import torch
import torch.nn as nn
import transformers
from .misc import ContextualModelConfig
def load_embedder_and_tokenizer(name: str) -> Tuple[
transformers.PreTrainedModel,
transformers.PreTrainedTokenizer
]:
if name.startswith("nomic") or (name == "bert-base-uncased"):
model = transformers.AutoModelForMaskedLM.from_pretrained(name, trust_remote_code=True).bert
tokenizer = transformers.AutoTokenizer.from_pretrained(name)
elif name in ["gtr-base", "gtr_base"]:
model = transformers.AutoModel.from_pretrained(
"sentence-transformers/gtr-t5-base"
).encoder
tokenizer = transformers.AutoTokenizer.from_pretrained(
"sentence-transformers/gtr-t5-base"
)
elif name == "pile-t5-base-encoder":
model = transformers.AutoModel.from_pretrained(
"EleutherAI/pile-t5-base"
).encoder
tokenizer = transformers.AutoTokenizer.from_pretrained(
"EleutherAI/pile-t5-base"
)
tokenizer.pad_token = tokenizer.eos_token
elif name == "pile-t5-base-decoder":
model = transformers.AutoModel.from_pretrained(
"EleutherAI/pile-t5-base"
).decoder
tokenizer = transformers.AutoTokenizer.from_pretrained(
"EleutherAI/pile-t5-base"
)
tokenizer.pad_token = tokenizer.eos_token
elif name.startswith("gpt2") or name.startswith("meta-llama") or ("Llama" in name):
model = transformers.AutoModelForCausalLM.from_pretrained(
name,
# torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
# device_map="auto",
)
model.padding_side = "right"
tokenizer = transformers.AutoTokenizer.from_pretrained(name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
else:
model = transformers.AutoModel.from_pretrained(name, trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(name)
# if use_bettertransformer:
# from optimum.bettertransformer import BetterTransformer
# model = BetterTransformer.transform(model)
return model, tokenizer
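# Usage sketch (illustrative; the checkpoint name below is just one example of the
# name prefixes handled above):
#
#   embedder, tokenizer = load_embedder_and_tokenizer("nomic-ai/nomic-bert-2048")
#   batch = tokenizer(["a short test sentence"], return_tensors="pt", padding=True)
#   hidden_states = embedder(**batch).last_hidden_state  # (batch, seq_len, hidden_size)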
def get_world_size() -> int:
try:
return torch.distributed.get_world_size()
except (RuntimeError, ValueError):
return 1
def get_rank() -> int:
try:
return torch.distributed.get_rank()
except (RuntimeError, ValueError):
return 0
def gather(t: torch.Tensor) -> torch.Tensor:
# torch.distributed.nn.all_gather scales by world size since the reduce op is SUM
# https://github.com/pytorch/pytorch/issues/58005
# only should use torch.distributed.nn.all_gather if we implement a `local_loss`
# like: https://github.com/mlfoundations/open_clip/issues/616
world_size = get_world_size()
if world_size == 1:
return t
if t.ndim == 0:
t = t.unsqueeze(0)
gathered = [torch.empty_like(t) for _ in range(world_size)]
torch.distributed.all_gather(gathered, t)
gathered[get_rank()] = t
return torch.cat(gathered, dim=0)
def gather_sum(t: torch.Tensor) -> torch.Tensor:
# torch.distributed.nn.all_gather scales by world size since the reduce op is SUM
# https://github.com/pytorch/pytorch/issues/58005
# only should use torch.distributed.nn.all_gather if we implement a `local_loss`
# like: https://github.com/mlfoundations/open_clip/issues/616
world_size = get_world_size()
if world_size == 1:
return t
if t.ndim == 0:
t = t.unsqueeze(0)
gathered = [torch.empty_like(t) for _ in range(world_size)]
torch.distributed.all_gather(gathered, t)
gathered = torch.stack(gathered, dim=0)
return gathered.sum(dim=0) # Sum across workers
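# Worked example (illustrative), with world_size = 2:
#   rank 0 holds t = [1, 2, 0, 0] and rank 1 holds t = [0, 0, 3, 4];
#   gather_sum(t) returns [1, 2, 3, 4] on every rank. This is how maxsim() below
#   reassembles per-rank slices without double counting.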
def get_num_proc() -> int:
world_size: int = get_world_size()
try:
# os.sched_getaffinity respects schedulers, unlike cpu_count(), but it's only available
# on some Unix platforms, so we support both!
return len(os.sched_getaffinity(0)) // world_size # type: ignore[attr-defined]
except AttributeError:
return multiprocessing.cpu_count() // world_size
def torch_main_worker_finish_first(func: Callable):
def wrapper(*args, **kwargs):
# Get local rank (need to support non-DDP).
try:
local_rank = torch.distributed.get_rank()
ddp_enabled = True
except (RuntimeError, ValueError):
local_rank = -1
ddp_enabled = False
is_main_worker = local_rank <= 0
# Run on main worker first.
if is_main_worker:
result = func(*args, **kwargs)
# Then everyone waits.
if ddp_enabled:
torch.distributed.barrier()
# Run on other workers now.
if not is_main_worker:
result = func(*args, **kwargs)
# Now everyone waits again.
if ddp_enabled:
torch.distributed.barrier()
return result
return wrapper
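# Usage sketch (illustrative): wrap work that populates a shared cache (for example,
# tokenizing a dataset on a shared filesystem) so that rank 0 runs it first and the
# remaining ranks only start after the barrier, typically hitting the warm cache.
#
#   @torch_main_worker_finish_first
#   def preprocess_dataset(path: str):
#       ...  # heavy, cacheable work (hypothetical helper)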
def print0(*args, **kwargs) -> None:
if get_rank() == 0:
print(*args, **kwargs)
def verify_ddp_weights_equal(model: torch.nn.Module, atol: float = 1e-5) -> None:
if hasattr(model, "module"):
model = model.module
world_size = get_world_size()
if world_size > 8:
print0(f"[verify_ddp_weights_equal] Skipping with world_size={world_size} ⚠️")
return
for name, param in model.named_parameters():
if param is None: continue
if param.grad is None:
print0(f"[verify_ddp_weights_equal] Skipping param [{name}] with no grad")
continue
gathered_param = gather(param).reshape((world_size, -1))
absolute_diffs = (gathered_param[None, 0, :] - gathered_param).abs()
rank_params_eq = (absolute_diffs < atol).all()
assert rank_params_eq, f"❌ param [{name}] not equal - got max_absolute_diff={absolute_diffs.max()}"
###################################################################################################################
gathered_param_grad = gather(param.grad).reshape((world_size, -1))
absolute_grad_diffs = (gathered_param_grad[None, 0, :] - gathered_param_grad).abs()
rank_grad_params_eq = (absolute_grad_diffs < atol).all()
assert rank_grad_params_eq, f"❌ param [{name}] grad not equal - got max_absolute_diff={absolute_grad_diffs.max()}"
###################################################################################################################
print0("[verify_ddp_weights_equal] Verified DDP parameter correctness ✅")
def mean_pool_3d(
hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
B, T, S, D = hidden_states.shape
unmasked_outputs = hidden_states * attention_mask[..., None]
pooled_outputs = unmasked_outputs.sum(dim=2) / (attention_mask.sum(dim=2)[..., None] + 1e-9)
# fix for gradient flow: fill empty rows with the mean of the rest of the sequence
sequence_means = (
hidden_states.reshape((B, S * T, D))
.mean(dim=1, keepdim=True)
.expand(-1, T, -1)
)
pooled_outputs = pooled_outputs.where(
(attention_mask.sum(dim=2)[..., None] > 0),
sequence_means
)
assert pooled_outputs.shape == (B, T, D)
return pooled_outputs
def mean_pool(
hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
B, _S, D = hidden_states.shape
unmasked_outputs = hidden_states * attention_mask[..., None]
pooled_outputs = unmasked_outputs.sum(dim=1) / (attention_mask.sum(dim=1)[:, None] + 1e-20)
assert pooled_outputs.shape == (B, D)
return pooled_outputs
def mean_pool_weighted(
hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
B, _S, D = hidden_states.shape
    # Weight each position by its (1-based) index among non-padding tokens,
    # without mutating the caller's attention_mask in place.
    attention_mask = attention_mask * attention_mask.cumsum(dim=1)  # [0,1,1,1,0,0] -> [0,1,2,3,0,0]
s = torch.sum(hidden_states * attention_mask.unsqueeze(-1).float(), dim=1)
d = attention_mask.sum(dim=1, keepdim=True).float()
return s / d
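# Worked example (illustrative): for attention_mask = [0, 1, 1, 1, 0, 0], the cumulative
# sum is [0, 1, 2, 3, 3, 3] and multiplying by the mask gives weights [0, 1, 2, 3, 0, 0],
# so later (non-padding) tokens contribute more to the weighted mean.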
def slice_sparse_tensor_rows(t: torch.sparse.Tensor, min_row: int, max_row: int) -> torch.sparse.Tensor:
assert min_row < max_row, f"can't slice from row {min_row} to {max_row}"
t = t.coalesce()
row_idxs = t.indices()[0]
index_mask = (min_row <= row_idxs) & (row_idxs < max_row)
num_rows = (max_row - min_row)
num_cols = t.shape[1]
idxs = t.indices()[:, index_mask]
vals = t.values()[index_mask]
return torch.sparse_coo_tensor(idxs, vals, size=(num_rows, num_cols)).coalesce()
def slice_tensor_rows(t: torch.Tensor, min_row: int, max_row: int) -> torch.Tensor:
if t.is_sparse:
return slice_sparse_tensor_rows(t=t, min_row=min_row, max_row=max_row)
else:
return t[min_row:max_row]
@torch.no_grad()
def maxsim(
        X: torch.Tensor, y: torch.Tensor,
        maximize: bool, chunk_size: int = 8_000,
        debug_mem_usage: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
device = X.device
n_samples = X.shape[0]
max_sim_v = torch.zeros(n_samples, device=device, dtype=X.dtype)
max_sim_i = torch.zeros(n_samples, device=device, dtype=torch.int64)
# TODO: Implement faster max (without going to dense tensors).
# TODO: Use multiple GPUs.
rank = get_rank()
world_size = get_world_size()
worker_worklist_size = int(math.ceil(n_samples / world_size))
splits_start_idx = worker_worklist_size * rank
splits_end_idx = worker_worklist_size * (rank + 1)
for i in range(splits_start_idx, splits_end_idx, chunk_size):
start, end = i, min(i + chunk_size, n_samples)
sub_x = slice_tensor_rows(X, start, end)
if debug_mem_usage: print(f"[maxsim] step {i} cuda mem free/total = {torch.cuda.mem_get_info()}")
if debug_mem_usage: print("[maxsim] sub_x.shape:", sub_x.shape, "//", "y.shape:", y.shape)
        sub_sim = sub_x @ y  # TODO: implement a sparse max here to save memory.
if maximize:
sub_max_sim_v, sub_max_sim_i = sub_sim.to_dense().max(dim=-1)
else:
sub_max_sim_v, sub_max_sim_i = sub_sim.to_dense().min(dim=-1)
del sub_sim
del sub_x
torch.cuda.empty_cache() # needs to happen after maxsim for some reason.
max_sim_v[start: end] = sub_max_sim_v
max_sim_i[start: end] = sub_max_sim_i
# gather
max_sim_v = gather_sum(max_sim_v)
max_sim_i = gather_sum(max_sim_i)
k = y.shape[1]
assert max_sim_v.shape == (n_samples,)
assert max_sim_i.shape == (n_samples,)
    assert max_sim_i.min() >= 0
    assert max_sim_i.max() < k
return max_sim_v, max_sim_i
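# Usage sketch (illustrative): find, for every row of X, the index of the most similar
# column of y, e.g. the nearest corpus document for each query embedding. Note that y is
# expected with shape (embedding_dim, num_documents).
#
#   values, indices = maxsim(query_embeddings, corpus_embeddings.T, maximize=True)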
def forward_batched(
model: torch.nn.Module,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
batch_size: int,
dataset_input_ids: Optional[torch.Tensor] = None,
dataset_attention_mask: Optional[torch.Tensor] = None,
**second_stage_model_kwargs,
) -> torch.Tensor:
if hasattr(model, "module"):
model = model.module
if hasattr(model, "first_stage_model"):
# Support pooling over 3D dataset_input_ids inputs.
if len(dataset_input_ids.shape) == 2:
dataset_input_ids = dataset_input_ids[None]
dataset_attention_mask = dataset_attention_mask[None]
dataset_embeddings = []
for j in range(len(dataset_input_ids)):
i = 0
dataset_embeddings_batch = []
while i < dataset_input_ids.shape[1]:
dataset_embeddings_batch.append(
model.first_stage_model(
input_ids=dataset_input_ids[j][i:i+batch_size],
attention_mask=dataset_attention_mask[j][i:i+batch_size],
)
)
i += batch_size
dataset_embeddings.append(
torch.cat(dataset_embeddings_batch, dim=0)
)
# Automatically pool over 3D dataset_input_ids.
dataset_embeddings = torch.stack(dataset_embeddings, dim=0).mean(dim=0)
j = 0
outputs = []
while j < len(input_ids):
outputs.append(
model.second_stage_model(
input_ids=input_ids[j:j+batch_size],
attention_mask=attention_mask[j:j+batch_size],
dataset_embeddings=dataset_embeddings,
**second_stage_model_kwargs,
)
)
j += batch_size
return torch.cat(outputs, dim=0)
else:
i = 0
outputs = []
while i < len(input_ids):
# breakpoint()
outputs.append(
model(
input_ids=input_ids[i:i+batch_size],
attention_mask=attention_mask[i:i+batch_size],
**second_stage_model_kwargs,
)
)
i += batch_size
return torch.cat(outputs, dim=0)
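# Usage sketch (illustrative): embed a large set of queries in chunks of `batch_size`,
# conditioning on a pre-tokenized context corpus when `model` is a two-stage
# DatasetTransformer (the corpus tensors are unused for plain single-stage models).
#
#   embeddings = forward_batched(
#       model,
#       input_ids=query_ids, attention_mask=query_mask, batch_size=32,
#       dataset_input_ids=corpus_ids, dataset_attention_mask=corpus_mask,
#   )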
def last_token_pool(hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
# https://github.com/ContextualAI/gritlm/blob/main/gritlm/gritlm.py#L190
b, n, d = hidden_state.size()
# Get the last `1` in the attention mask of each item
# Often it is just `gather_indices = torch.argmin(attention_mask, 1, keepdim=False) - 1`
# except when 1) There's all 1's 2) There's 0's before the 1's
reversed_mask = torch.flip(attention_mask, dims=(1,))
argmax_reverse = torch.argmax(reversed_mask, dim=1, keepdim=False)
gather_indices = attention_mask.size(1) - argmax_reverse - 1
# If there are empty sequences, where the index would become -1 it will crash so set them to 0
gather_indices = torch.clamp(gather_indices, min=0)
# Turn indices from shape [b] -> [b, 1, d]
gather_indices = gather_indices.unsqueeze(-1).repeat(1, d)
gather_indices = gather_indices.unsqueeze(1)
assert gather_indices.shape == (b, 1, d)
# Gather along the seq len: [b, n, d] -> [b, d]
# Actually no need for the attention mask as we gather the last token where attn_mask=1 but
# as some indices (which shouldn't be attended to) may be 0 due to clamp, use mask to ignore them again
input_mask_expanded = attention_mask.unsqueeze(-1).expand((b, n, d)).float()
return torch.gather(hidden_state * input_mask_expanded, 1, gather_indices).squeeze(dim=1)
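# Worked example (illustrative): for attention_mask = [1, 1, 1, 0, 0], the reversed mask
# is [0, 0, 1, 1, 1], its argmax is 2, and the gather index becomes 5 - 2 - 1 = 2, i.e.
# the hidden state of the last non-padding token is returned.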
def limit_layers(model: transformers.PreTrainedModel, n_layers: int) -> None:
if hasattr(model, 'transformer'):
if hasattr(model.transformer, 'h'):
# gpt2
model.transformer.h = model.transformer.h[:n_layers]
else:
model.transformer.layer = model.transformer.layer[:n_layers]
elif hasattr(model, 'encoder'):
if hasattr(model.encoder, 'layers'):
model.encoder.layers = model.encoder.layers[:n_layers]
else:
model.encoder.layer = model.encoder.layer[:n_layers]
else:
        raise RuntimeError(f"don't know how to limit layers of model {type(model)}")
def disable_dropout(model: torch.nn.Module):
dropout_modules = [m for m in model.modules() if isinstance(m, torch.nn.Dropout)]
for m in dropout_modules:
m.p = 0.0
print0(
f"Disabled {len(dropout_modules)} dropout modules from model type {type(model)}"
)
def disable_causality(model: torch.nn.Module):
disabled_modules = 0
for m in model.modules():
if hasattr(m, "is_causal"):
m.is_causal = False
disabled_modules += 1
print0(
f"Set is_causal=False in {disabled_modules} modules from model type {type(model)}"
)
class ContextualModelMixin(nn.Module):
@property
def num_corpus_tokens(self) -> int:
return self.transductive_corpus_size * self.transductive_tokens_per_document
def contextual_init(self):
self.n_soft_prompt = 8
self.prompt_projection = torch.nn.Sequential(
torch.nn.Linear(self.hidden_size, self.hidden_size),
torch.nn.ReLU(),
torch.nn.Linear(self.hidden_size, self.hidden_size * self.n_soft_prompt)
)
self.transductive_corpus_size = vars(self.config).get("transductive_corpus_size", 1)
self.transductive_tokens_per_document = vars(self.config).get("transductive_tokens_per_document", 1)
self.randomize_dataset_sequence_order = True
self.sequence_dropout_prob = vars(self.config).get("transductive_sequence_dropout_prob", 0.0)
if self.sequence_dropout_prob > 0.0:
self.sequence_dropout_null_embedding = torch.nn.Parameter(
torch.randn(self.hidden_size) * 0.01,
requires_grad = True
)
self.output_projection = torch.nn.Sequential(
torch.nn.Linear(self.hidden_size, self.hidden_size),
torch.nn.ReLU(),
torch.nn.Linear(self.hidden_size, self.hidden_size)
)
def _prepare_dataset_embeddings(
self,
input_ids: torch.Tensor, dataset_embeddings: torch.Tensor,
null_dataset_embedding: bool = False,
) -> torch.Tensor:
if not isinstance(dataset_embeddings, torch.Tensor):
dataset_embeddings = torch.tensor(dataset_embeddings)
if len(dataset_embeddings.shape) == 2:
# Auto-expand for a batch.
dataset_embeddings = dataset_embeddings[None, :, :] # (b, d) -> (1, b, d)
dataset_embeddings = dataset_embeddings.to(input_ids.device)
batch_size = input_ids.shape[0]
if (self.transductive_tokens_per_document > 1):
if self.training:
# Choose N random documents to fill our context window with.
# This logic is a little confusing but allows us to sample a
# different batch *per-document*
assert dataset_embeddings.shape[1] == self.transductive_tokens_per_document
R = torch.randint(
low=0,
high=len(dataset_embeddings),
size=(batch_size, self.config.transductive_corpus_size),
device=dataset_embeddings.device
)
# TODO make this deterministic somehow for evaluation?
dataset_embeddings = dataset_embeddings[R].reshape((batch_size, self.num_corpus_tokens, self.hidden_size))
else:
dataset_embeddings = dataset_embeddings.reshape((1, self.num_corpus_tokens, self.hidden_size))
# print("reshaped to dataset_embeddings.shape =", dataset_embeddings.shape)
if dataset_embeddings.shape[1] > self.num_corpus_tokens:
# If too many dataset embeddings are passed in, just take the first N until
# we have the proper number.
dataset_embeddings = dataset_embeddings[:, :self.num_corpus_tokens, :]
        emb_batch_size, corpus_size, _hidden_size = dataset_embeddings.shape
        if emb_batch_size == 1:
            # Auto-expand for a batch.
            dataset_embeddings = dataset_embeddings.expand((batch_size, -1, -1))
if self.training and self.sequence_dropout_prob > 0.0:
sequence_dropout_mask = (
torch.rand((batch_size, corpus_size), device=dataset_embeddings.device) < self.sequence_dropout_prob
)
null_embeddings = self.sequence_dropout_null_embedding[None, None].expand(batch_size, corpus_size, -1)
dataset_embeddings = torch.where(
sequence_dropout_mask[..., None], null_embeddings, dataset_embeddings
)
elif null_dataset_embedding:
null_embeddings = self.sequence_dropout_null_embedding[None, None].expand(batch_size, corpus_size, -1)
dataset_embeddings = null_embeddings
# print(f"[ContextualModelMixin] dataset_embeddings.shape = {dataset_embeddings.shape}")
# backbone_max_seq_length = self.backbone.config.max_trained_positions
# assert batch_size + (2 * self.n_soft_prompt + corpus_size) <= backbone_max_seq_length, "too many hard negatives for backbone model"
soft_prompt = torch.ones((1, self.hidden_size), device=dataset_embeddings.device, dtype=dataset_embeddings.dtype)
soft_prompt = self.prompt_projection(soft_prompt).reshape((1, self.n_soft_prompt, self.hidden_size))
        soft_prompt = soft_prompt.expand((len(dataset_embeddings), -1, -1))  # (1, n_soft_prompt, d) -> (b, n_soft_prompt, d)
        soft_prompt = torch.cat((dataset_embeddings, soft_prompt), dim=1)  # (b, num_corpus_tokens + n_soft_prompt, d)
# print(f"[ContextualModelMixin] soft_prompt.shape = {soft_prompt.shape}")
if self.training and self.randomize_dataset_sequence_order:
randomized_order = torch.stack(
[
torch.cat(
(
torch.randperm(corpus_size, device=soft_prompt.device),
torch.arange(self.n_soft_prompt, device=soft_prompt.device) + corpus_size
), dim=0)
for _ in range(batch_size)])
randomized_order = randomized_order.to(soft_prompt.device)
soft_prompt = soft_prompt.gather(1, randomized_order[..., None].expand_as(soft_prompt))
return soft_prompt
class BiEncoder(transformers.PreTrainedModel):
embedder: transformers.PreTrainedModel
def __init__(
self,
config, #: transformers.PreTrainedConfig,
):
super().__init__(config=config)
embedder, _ = load_embedder_and_tokenizer(
config.embedder,
)
if config.limit_layers:
print0(f"Limiting layers to {config.limit_layers}")
limit_layers(embedder, config.limit_layers)
self.embedder = embedder
# if ("t5" in embedder.config.model_type):
# print0(f"using torch.compile() on embedder of type `{embedder.config.model_type}`")
# self.embedder = torch.compile(self.embedder)
self.hidden_size = self.embedder.config.hidden_size
# Allow pooling to multiple tokens per document
self.transductive_tokens_per_document = vars(self.config).get("transductive_tokens_per_document", 1)
self.mlp = torch.nn.Sequential(
torch.nn.Linear(self.hidden_size, self.hidden_size),
torch.nn.GELU(),
torch.nn.Linear(self.hidden_size, self.config.embedding_output_dim or self.hidden_size),
)
self.temp = config.logit_scale
if config.disable_dropout:
disable_dropout(self)
self.pooling_strategy = vars(config).get("pooling_strategy", "mean")
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
dataset_input_ids: Optional[torch.Tensor] = None,
dataset_attention_mask: Optional[torch.Tensor] = None,
token_type_ids = None,
output_hidden_states: bool = False,
) -> torch.Tensor:
"""
query_embedding (float torch.Tensor) - shape (batch_size, embedding_dim)
document_embeddings (float torch.Tensor) - shape (corpus_size, embedding_dim)
where the corpus_size >= batch_size and is structured like this:
[d1, d2, d3, hn1_1, hn1_2, hn2_1, hn2_2, hn3_1, hn3_2]
for a corpus with three documents and two hard negatives per document
"""
# del dataset_input_ids
# del dataset_attention_mask
del token_type_ids
# from cde.lib.dist import get_rank
# tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
# if get_rank() == 0:
# breakpoint()
# torch.distributed.barrier()
outputs = (
self.embedder(
input_ids=input_ids,
attention_mask=attention_mask,
).last_hidden_state
)
if self.transductive_tokens_per_document > 1:
document_embeddings = None
batch_size, seq_length, output_dim = outputs.shape
if seq_length % self.transductive_tokens_per_document != 0:
# Pad to nearest multiple
n_extra_embeds = self.transductive_tokens_per_document - (seq_length % self.transductive_tokens_per_document)
                outputs = torch.cat(
                    (outputs, torch.zeros((batch_size, n_extra_embeds, output_dim), dtype=outputs.dtype, device=outputs.device)),
                    dim=1
                )
                attention_mask = torch.cat(
                    (attention_mask, torch.zeros((batch_size, n_extra_embeds), dtype=attention_mask.dtype, device=attention_mask.device)),
                    dim=1
                )
seq_length += n_extra_embeds
print(f"Added {n_extra_embeds} padding tokens to input_ids and attention_mask")
# print("ftransductive_tokens_per_document {self.transductive_tokens_per_document} outputs.shape =", outputs.shape)
outputs = outputs.reshape(
(batch_size, self.transductive_tokens_per_document, seq_length // self.transductive_tokens_per_document, output_dim)
)
attention_mask = attention_mask.reshape((batch_size, self.transductive_tokens_per_document, -1))
document_embeddings = mean_pool_3d(outputs, attention_mask)
document_embeddings = document_embeddings.reshape((batch_size, self.transductive_tokens_per_document, output_dim))
        else:
            if self.pooling_strategy == "mean":
                document_embeddings = mean_pool(outputs, attention_mask)
            else:
                # Max-pool over the sequence dimension, ignoring padding positions.
                masked_outputs = outputs.masked_fill(attention_mask[..., None] == 0, float("-inf"))
                document_embeddings = masked_outputs.max(dim=1).values
output = self.mlp(document_embeddings)
if output_hidden_states:
return {
"hidden_states": outputs,
"pooled": output,
}
else:
return output
class DatasetConditionedAutoregressive(transformers.PreTrainedModel, ContextualModelMixin):
def __init__(
self,
config,
dataset_backbone: transformers.PreTrainedModel,
first_stage_hidden_size: int,
):
super().__init__(config=config)
self.backbone = dataset_backbone
self.backbone_hidden_size = self.backbone.config.hidden_size
self.hidden_size = first_stage_hidden_size # Input token size
self.contextual_init()
disable_causality(self.backbone)
self.input_ln = torch.nn.LayerNorm(
self.backbone_hidden_size,
eps=1e-5
)
# Override contextual init
self.output_projection = torch.nn.Sequential(
torch.nn.Linear(self.backbone_hidden_size, self.backbone_hidden_size),
torch.nn.ReLU(),
torch.nn.Linear(self.backbone_hidden_size, self.backbone_hidden_size)
)
self._shift_rotary_embedding()
@property
def num_corpus_tokens(self) -> int:
return self.config.transductive_corpus_size * self.transductive_tokens_per_document
@property
def corpus_token_ratio(self) -> float:
# How many tokens from the first stage make one token in the second
# stage?
return self.backbone_hidden_size / self.hidden_size
def corpus_token_pad_size(self, n_tokens: int) -> int:
return self.hidden_size % self.backbone_hidden_size
def _shift_rotary_embedding(self) -> None:
disable_transductive_rotary_embedding = vars(self.config).get("disable_transductive_rotary_embedding", True)
# TODO: Can we do this for LLAMA?
print("Warning: Positional embedding disabling not implemented for LLAMA.")
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
dataset_embeddings: torch.Tensor,
output_hidden_states: bool = False,
null_dataset_embedding: bool = False,
) -> torch.Tensor:
soft_prompt = self._prepare_dataset_embeddings(
input_ids=input_ids,
dataset_embeddings=dataset_embeddings,
null_dataset_embedding=null_dataset_embedding,
)
# Reshape for this model.
# print("[DatasetConditionedAutoregressive] 1 -> soft_prompt.shape =", soft_prompt.shape)
num_soft_elements = torch.prod(torch.tensor(soft_prompt.shape[1:])).item()
soft_prompt = soft_prompt.reshape((soft_prompt.shape[0], num_soft_elements))
num_padding_elements = self.backbone_hidden_size - (num_soft_elements % self.backbone_hidden_size)
        padding = torch.ones((soft_prompt.shape[0], num_padding_elements), device=soft_prompt.device, dtype=soft_prompt.dtype)
soft_prompt = torch.cat((soft_prompt, padding), dim=1)
soft_prompt = soft_prompt.reshape(
(soft_prompt.shape[0], -1, self.backbone_hidden_size)
)
soft_prompt = self.input_ln(soft_prompt)
# print("[DatasetConditionedAutoregressive] 2 -> soft_prompt.shape =", soft_prompt.shape)
backbone_attention_mask = torch.ones(
soft_prompt.shape[0:2],
dtype=torch.long,
device=soft_prompt.device,
)
token_embeddings = self.backbone.get_input_embeddings()
inputs_embeds = token_embeddings(input_ids) # (b, s) -> (b, s, d)
# print("[2] inputs_embeds.shape =", inputs_embeds.shape)
inputs_embeds = torch.cat((soft_prompt, inputs_embeds), dim=1) # (v, 4+b+s, d)
# print("[3.a] inputs_embeds.shape =", inputs_embeds.shape)
input_attention_mask = torch.cat((backbone_attention_mask, attention_mask), dim=1)
# print("[3.b] attention_mask.shape =", attention_mask.shape)
output = self.backbone(
inputs_embeds=inputs_embeds,
attention_mask=input_attention_mask,
output_hidden_states=True,
) # (1, 4 + b + s, d)
# trim soft prompt
last_hidden_state = output.hidden_states[-1]
n_soft_prompt_tokens = soft_prompt.shape[1]
output_vectors = last_hidden_state[:, n_soft_prompt_tokens:, :]
output_attention_mask = input_attention_mask[:, n_soft_prompt_tokens:]
# Take last token position
if vars(self.config).get("pooling_strategy") == "last_token":
output_pooled = last_token_pool(output_vectors, output_attention_mask)
elif vars(self.config).get("pooling_strategy") == "mean":
output_pooled = mean_pool(output_vectors, output_attention_mask)
else:
output_pooled = mean_pool_weighted(output_vectors, output_attention_mask)
# average with original vectors
# TODO: Argparse for pooling strategy.
        output = self.output_projection(output_pooled)  # (b, d) -> (b, d)
if output_hidden_states:
return {
"hidden_states": output_vectors,
"pooled": output,
}
else:
return output
class DatasetConditionedBiencoder(transformers.PreTrainedModel, ContextualModelMixin):
def __init__(
self,
config,
dataset_backbone: transformers.PreTrainedModel,
):
super().__init__(config=config)
        self.backbone = dataset_backbone
        self.hidden_size = self.backbone.config.hidden_size
# self.input_ln = torch.nn.LayerNorm(
# self.hidden_size,
# eps=self.backbone.config.layer_norm_epsilon
# )
self.contextual_init()
self._shift_rotary_embedding()
@property
def num_corpus_tokens(self) -> int:
return self.config.transductive_corpus_size * self.transductive_tokens_per_document
def _shift_rotary_embedding(self) -> None:
disable_transductive_rotary_embedding = vars(self.config).get("disable_transductive_rotary_embedding", True)
if self.backbone.config.model_type.startswith("nomic") and disable_transductive_rotary_embedding:
# We only want to apply positional embeddings to the
# *text* portion of the backbone network.
self.backbone.config.rotary_start_pos = 0.0
rotary_disabled = 0
rotary_start_pos = self.num_corpus_tokens
for module in self.backbone.modules():
if hasattr(module, "rotary_emb_dim"):
module.rotary_start_pos = rotary_start_pos
rotary_disabled += 1
print0(f"modified {rotary_disabled} rotary modules – set rotary_start_pos to {rotary_start_pos}")
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
dataset_embeddings: torch.Tensor,
output_hidden_states: bool = False,
null_dataset_embedding: bool = False,
) -> torch.Tensor:
# print(f"[DatasetConditionedBiencoder - 0] input_ids.shape => {input_ids.shape} // dataset_embeddings.shape =", dataset_embeddings.shape)
soft_prompt = self._prepare_dataset_embeddings(
input_ids=input_ids,
dataset_embeddings=dataset_embeddings,
null_dataset_embedding=null_dataset_embedding,
)
# print(f"[DatasetConditionedBiencoder - 1] soft_prompt.shape => {soft_prompt.shape}")
backbone_attention_mask = torch.ones(
soft_prompt.shape[0:2],
dtype=torch.long,
device=soft_prompt.device,
)
inputs_embeds = self.backbone.embeddings(input_ids) # (b, s) -> (b, s, d)
# print("[2] inputs_embeds.shape =", inputs_embeds.shape)
inputs_embeds = torch.cat((soft_prompt, inputs_embeds), dim=1) # (v, 4+b+s, d)
# print("[3.a] inputs_embeds.shape =", inputs_embeds.shape)
attention_mask = torch.cat((backbone_attention_mask, attention_mask), dim=1)
# print("[3.b] attention_mask.shape =", attention_mask.shape)
output = self.backbone(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
) # (1, 4 + b + s, d)
        # Trim the soft-prompt positions; keep only the text tokens.
        n_soft_prompt_tokens = soft_prompt.shape[1]
        # print("n_soft_prompt_tokens =", n_soft_prompt_tokens)
        output_vectors = output.last_hidden_state[:, n_soft_prompt_tokens:, :]
output_attention_mask = attention_mask[:, n_soft_prompt_tokens:]
# print("pooling output_vectors.shape =", output_vectors.shape, "and output_attention_mask.shape =", output_attention_mask.shape)
output_pooled = mean_pool(output_vectors, output_attention_mask)
# average with original vectors
# TODO: Argparse for pooling strategy.
# output_vectors = torch.cat((soft_prompt_pooled, output_pooled), dim=1) # (b, d) + (b, d) -> (b, 2d)
# print("output_pooled.shape =", output_pooled.shape)
        output = self.output_projection(output_pooled)  # (b, d) -> (b, d)
# print("returning output.shape =", output.shape)
if output_hidden_states:
return {
"hidden_states": output_vectors,
"pooled": output,
}
else:
return output
class DatasetPrefixBiencoder(transformers.PreTrainedModel, ContextualModelMixin):
def __init__(
self,
config, #: transformers.PreTrainedConfig,
embedder: transformers.PreTrainedModel,
):
super().__init__(config=config)
self.embedder = embedder
self.hidden_size = self.embedder.config.hidden_size
self.contextual_init()
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
dataset_input_ids: torch.Tensor,
dataset_attention_mask: torch.Tensor,
output_hidden_states: bool = False,
) -> torch.Tensor:
R = torch.randint(low=0, high=len(dataset_input_ids), size=(len(input_ids),), device=dataset_input_ids.device)
dataset_input_ids = dataset_input_ids[R]
input_ids = torch.cat((dataset_input_ids, input_ids), dim=1)
        dataset_attention_mask = torch.ones_like(dataset_input_ids)
input_attention_mask = torch.cat((dataset_attention_mask, attention_mask), dim=1)
output_attention_mask = torch.cat(
(torch.zeros_like(dataset_input_ids), attention_mask), dim=1
)
output = self.embedder(
input_ids=input_ids,
attention_mask=input_attention_mask,
)
output_vectors = output.last_hidden_state
output_pooled = mean_pool(output_vectors, output_attention_mask)
        output = self.output_projection(output_pooled)  # (b, d) -> (b, d)
if output_hidden_states:
S_d = dataset_attention_mask.shape[1]
output_vectors = output_vectors[:, S_d:, :]
return {
"hidden_states": output_vectors,
"pooled": output,
}
else:
return output
class DatasetTransformer(transformers.PreTrainedModel):
config_class = ContextualModelConfig
embedder: transformers.PreTrainedModel
dataset_backbone: transformers.PreTrainedModel
def __init__(
self,
config,
):
super().__init__(config=config)
dataset_backbone, _ = load_embedder_and_tokenizer(
vars(config).get("dataset_backbone", config.embedder)
)
if config.limit_layers:
print0(f"Limiting layers to {config.limit_layers}")
limit_layers(dataset_backbone, config.limit_layers)
biencoder_config = copy.deepcopy(config)
biencoder_config.embedding_output_dim = None
biencoder_config.limit_layers = vars(self.config).get("limit_layers_first_stage", None)
self.first_stage_model = BiEncoder(
config=biencoder_config,
)
if vars(config).get("autoregressive_backbone", False):
self.second_stage_model = DatasetConditionedAutoregressive(
config=config,
dataset_backbone=dataset_backbone,
first_stage_hidden_size=self.first_stage_model.hidden_size,
)
else:
self.second_stage_model = DatasetConditionedBiencoder(
config=config,
dataset_backbone=dataset_backbone
)
self.temp = config.logit_scale
if config.disable_dropout:
disable_dropout(self)
transductive_tie_token_embeddings = vars(self.config).get("transductive_tie_token_embeddings", False)
if transductive_tie_token_embeddings:
self.second_stage_model.backbone.embeddings.word_embeddings.weight = (
self.first_stage_model.embedder.embeddings.word_embeddings.weight
)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
dataset_input_ids: Optional[torch.Tensor],
dataset_attention_mask: Optional[torch.Tensor],
output_hidden_states: bool = False,
) -> torch.Tensor:
"""
input_ids (long torch.Tensor) – ids of input tokens
attention_mask (bool torch.Tensor)
"""
dataset_embeddings = self.first_stage_model(
input_ids=dataset_input_ids,
attention_mask=dataset_attention_mask
)
return self.second_stage_model(
input_ids=input_ids,
attention_mask=attention_mask,
dataset_embeddings=dataset_embeddings,
output_hidden_states=output_hidden_states,
)
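# Usage sketch (illustrative): the two-stage flow is (1) embed a sample of the target
# corpus with the first-stage BiEncoder, then (2) embed each query or document
# conditioned on those contextual embeddings. Tensor names below are placeholders.
#
#   model = DatasetTransformer(config)  # config: ContextualModelConfig
#   embeddings = model(
#       input_ids=query_ids, attention_mask=query_mask,
#       dataset_input_ids=corpus_ids, dataset_attention_mask=corpus_mask,
#   )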
def get_model_class(name: str):
    if name == 'transductive':
return DatasetTransformer
elif name == 'biencoder':
return BiEncoder
elif name == "dataset_prefix_biencoder":
return DatasetPrefixBiencoder
else:
raise ValueError(f'unknown model cls {name}')
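# Usage sketch (illustrative):
#   model_cls = get_model_class("transductive")  # -> DatasetTransformer
#   model = model_cls(config)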