BiomedCLIP-vit-bert-hf / modeling_biomed_clip.py

Upload 8 files

e726b94 verified 6 months ago

39.6 kB

	# coding=utf-8
	# Modified by chuhac for a timm-free implementation
	# Model can be directly imported with ``from_pretrained`` and ``trust_remote_code = True`` in the huggingface format
	# Diff from HF CLIP Implementation:
	# 1. pre-norm instead of post-norm in Vision Tower (the original implementation is right but the module registration order is misleading)
	# 2. CLS Pooling with MLP in Text Tower
	# 3. Remove the LayerNorm applied before the encoder (`pre_layrnorm`) in Vision Tower
	# 4. CNN bias in Vision Tower
	# 5. Change layer_norm eps from 1e-5 to 1e-12, which introduces small numerical variations (on the order of 1e-5)
	## ******************************** ##
	# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" PyTorch BiomedCLIP model """
	""" No need for timm or open-clip-torch """


	from dataclasses import dataclass
	from typing import Any, Optional, Tuple, Union, List

	import math
	import torch
	import torch.utils.checkpoint
	from torch import nn
	from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

	from transformers.activations import ACT2FN
	from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
	from transformers.modeling_outputs import (
	BaseModelOutput,
	BaseModelOutputWithPooling,
	ImageClassifierOutput,
	BaseModelOutputWithPoolingAndCrossAttentions,
	BaseModelOutputWithPastAndCrossAttentions
	)
	from transformers.modeling_utils import PreTrainedModel
	from transformers.utils import (
	ModelOutput,
	add_code_sample_docstrings,
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	logging,
	replace_return_docstrings,
	)
	from transformers.models.clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
	from transformers.models.clip.modeling_clip import *

	from .configuration_biomed_clip import BiomedCLIPTextProjectionConfig, BiomedCLIPConfig


	logger = logging.get_logger(__name__)



	# contrastive loss function, adapted from
	# https://sachinruk.github.io/blog/2021-03-07-clip.html
	def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
	    """Return the cross-entropy of ``logits`` where row ``i`` has target class ``i``."""
	    # Targets are 0..len(logits)-1, created on the same device as the logits.
	    diagonal_targets = torch.arange(len(logits), device=logits.device)
	    return nn.functional.cross_entropy(logits, diagonal_targets)


	def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
	    """Average the contrastive loss computed on ``similarity`` and on its transpose."""
	    row_wise_loss = contrastive_loss(similarity)
	    column_wise_loss = contrastive_loss(similarity.t())
	    total = row_wise_loss + column_wise_loss
	    return total / 2.0


	class BiomedCLIPVisionEmbeddings(CLIPVisionEmbeddings):
	    """`CLIPVisionEmbeddings` variant whose patch convolution has a bias term."""

	    def __init__(self, config: CLIPVisionConfig):
	        super().__init__(config)

	        # Overrides `patch_embedding` after the parent constructor has run
	        # (presumably the parent builds a bias-free one -- TODO confirm).
	        # The original inline note states that open_clip uses bias=True.
	        patch = self.patch_size
	        conv_kwargs = dict(
	            in_channels=config.num_channels,
	            out_channels=self.embed_dim,
	            kernel_size=patch,
	            stride=patch,
	            bias=True,
	        )
	        self.patch_embedding = nn.Conv2d(**conv_kwargs)

	# TODO
	class BiomedCLIPTextEmbeddings(nn.Module):
	    """Sum of token, token-type and (optionally) absolute position embeddings,
	    followed by LayerNorm and dropout.
	    """

	    def __init__(self, config: CLIPTextConfig):
	        super().__init__()
	        embed_dim = config.hidden_size

	        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
	        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
	        self.token_type_embedding = nn.Embedding(config.type_vocab_size, embed_dim)

	        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
	        self.dropout = nn.Dropout(config.hidden_dropout_prob)
	        # Position embeddings are only added in `forward` when this is "absolute".
	        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

	        # Default ids of shape (1, max_position_embeddings). Both buffers are
	        # registered with persistent=False, so they are not part of the state dict.
	        self.register_buffer(
	            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
	        )
	        self.register_buffer(
	            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
	        )

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        past_key_values_length: int = 0,
	    ) -> torch.Tensor:
	        """Embed a batch of token ids (or pre-computed token embeddings).

	        Args:
	            input_ids: token ids of shape `(batch, seq_len)`; used when `inputs_embeds` is None.
	            token_type_ids: segment ids; defaults to all zeros.
	            position_ids: position indices; defaults to the slice
	                `[past_key_values_length, past_key_values_length + seq_len)` of the buffer.
	            inputs_embeds: token embeddings of shape `(batch, seq_len, hidden)`; when given,
	                the token embedding lookup is skipped.
	            past_key_values_length: offset applied to the default position ids.

	        Returns:
	            The normalized embeddings after dropout.

	        Raises:
	            ValueError: if neither `input_ids` nor `inputs_embeds` is provided.
	        """
	        if input_ids is not None:
	            input_shape = input_ids.size()
	        elif inputs_embeds is not None:
	            input_shape = inputs_embeds.size()[:-1]
	        else:
	            # Previously this surfaced as an AttributeError on `None.size()`.
	            raise ValueError("You have to specify either input_ids or inputs_embeds")

	        seq_length = input_shape[1]

	        if position_ids is None:
	            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

	        # Fall back to the all-zero registered buffer so the model can be traced
	        # without the caller passing token_type_ids.
	        if token_type_ids is None:
	            if hasattr(self, "token_type_ids"):
	                token_type_ids = self.token_type_ids[:, :seq_length].expand(input_shape[0], seq_length)
	            else:
	                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

	        if inputs_embeds is None:
	            inputs_embeds = self.token_embedding(input_ids)

	        embeddings = inputs_embeds + self.token_type_embedding(token_type_ids)
	        if self.position_embedding_type == "absolute":
	            embeddings = embeddings + self.position_embedding(position_ids)

	        embeddings = self.layer_norm(embeddings)
	        return self.dropout(embeddings)


	class BiomedCLIPAttention(nn.Module):
	    """Multi-head scaled dot-product self-attention with separate q/k/v/out projections.

	    Only self-attention is computed: keys and values are always projected from
	    `hidden_states`. The cross-attention / cache arguments of `forward` are
	    accepted for signature compatibility but are not used.
	    """

	    def __init__(self, config, position_embedding_type=None):
	        super().__init__()
	        self.config = config
	        self.embed_dim = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.head_dim = self.embed_dim // self.num_heads
	        if self.head_dim * self.num_heads != self.embed_dim:
	            raise ValueError(
	                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
	                f" {self.num_heads})."
	            )
	        # Kept as a public attribute; `forward` divides by sqrt(head_dim) directly.
	        self.scale = self.head_dim**-0.5
	        self.dropout = nn.Dropout(config.attention_dropout)

	        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

	    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
	        """Reshape `(batch, seq, embed_dim)` into `(batch, num_heads, seq, head_dim)`."""
	        new_x_shape = x.size()[:-1] + (self.num_heads, self.head_dim)
	        x = x.view(new_x_shape)
	        return x.permute(0, 2, 1, 3)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Apply self-attention to `hidden_states`.

	        Args:
	            hidden_states: input of shape `(batch, seq, embed_dim)`.
	            attention_mask: optional additive mask, added to the raw scores
	                before the softmax (must broadcast to `(batch, heads, seq, seq)`).
	            encoder_hidden_states, encoder_attention_mask, past_key_value: unused.
	            output_attentions: unused; the attention probabilities are always returned.

	        Returns:
	            `(output, attention_probs)` where `output` has the shape of
	            `hidden_states` and `attention_probs` are the post-dropout probabilities.
	        """
	        query_layer = self.transpose_for_scores(self.q_proj(hidden_states))
	        key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
	        value_layer = self.transpose_for_scores(self.v_proj(hidden_states))

	        # Raw attention scores: scaled dot product between queries and keys.
	        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
	        attention_scores = attention_scores / math.sqrt(self.head_dim)
	        if attention_mask is not None:
	            attention_scores = attention_scores + attention_mask

	        # Normalize the attention scores to probabilities.
	        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

	        # Dropout here removes whole attended-to tokens, as in the original Transformer paper.
	        attention_probs = self.dropout(attention_probs)

	        context_layer = torch.matmul(attention_probs, value_layer)

	        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, embed_dim).
	        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
	        new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
	        context_layer = context_layer.view(new_context_layer_shape)

	        outputs = self.out_proj(context_layer)
	        return outputs, attention_probs




	class BiomedCLIPEncoderLayer(nn.Module):
	    """One transformer block (self-attention + MLP) in pre-norm or post-norm layout.

	    `norm='pre'` applies each LayerNorm before its sub-layer (inside the residual
	    branch); `norm='post'` applies it after the residual addition.
	    """

	    _SUPPORTED_NORMS = ("pre", "post")

	    def __init__(self, config: BiomedCLIPConfig, norm='pre'):
	        super().__init__()
	        # Fail at construction time instead of leaving the layer without a usable forward.
	        if norm not in self._SUPPORTED_NORMS:
	            raise ValueError(f"`norm` must be one of {self._SUPPORTED_NORMS}, got {norm!r}.")
	        self.embed_dim = config.hidden_size
	        # Registration order is kept (layer_norm1, self_attn, layer_norm2, mlp)
	        # regardless of the order in which the modules are applied.
	        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	        self.self_attn = BiomedCLIPAttention(config)
	        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	        self.mlp = CLIPMLP(config)
	        self.norm = norm

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.FloatTensor]:
	        """Dispatch to `pre_norm_forward` or `post_norm_forward` according to `self.norm`.

	        Raises:
	            ValueError: if `self.norm` is neither `'pre'` nor `'post'`.
	        """
	        if self.norm == 'pre':
	            return self.pre_norm_forward(hidden_states, attention_mask, output_attentions)
	        if self.norm == 'post':
	            return self.post_norm_forward(hidden_states, attention_mask, output_attentions)
	        raise ValueError(f"`norm` must be one of {self._SUPPORTED_NORMS}, got {self.norm!r}.")

	    def pre_norm_forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.FloatTensor]:
	        """
	        Pre-norm block: `x + attn(ln1(x))`, then `h + mlp(ln2(h))`.

	        Args:
	            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
	            attention_mask (`torch.FloatTensor`): attention mask of size
	                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
	            output_attentions (`bool`, optional):
	                Whether or not to also return the attention weights of this layer.

	        Returns:
	            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
	        """
	        residual = hidden_states

	        hidden_states = self.layer_norm1(hidden_states)
	        hidden_states, attn_weights = self.self_attn(
	            hidden_states=hidden_states,
	            attention_mask=attention_mask,
	            output_attentions=output_attentions,
	        )
	        hidden_states = residual + hidden_states

	        residual = hidden_states
	        hidden_states = self.layer_norm2(hidden_states)
	        hidden_states = self.mlp(hidden_states)
	        hidden_states = residual + hidden_states

	        outputs = (hidden_states,)

	        if output_attentions:
	            outputs += (attn_weights,)

	        return outputs

	    def post_norm_forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.FloatTensor]:
	        """
	        Post-norm block: `ln1(x + attn(x))`, then `ln2(h + mlp(h))`.

	        Args:
	            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
	            attention_mask (`torch.FloatTensor`): attention mask of size
	                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
	            output_attentions (`bool`, optional):
	                Whether or not to also return the attention weights of this layer.

	        Returns:
	            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
	        """
	        residual = hidden_states

	        hidden_states, attn_weights = self.self_attn(
	            hidden_states=hidden_states,
	            attention_mask=attention_mask,
	            output_attentions=output_attentions,
	        )
	        hidden_states = residual + hidden_states

	        hidden_states = self.layer_norm1(hidden_states)

	        residual = hidden_states
	        hidden_states = self.mlp(hidden_states)
	        hidden_states = residual + hidden_states
	        hidden_states = self.layer_norm2(hidden_states)
	        outputs = (hidden_states,)

	        if output_attentions:
	            outputs += (attn_weights,)

	        return outputs


	class BiomedCLIPTextProjection(nn.Module):
	    """Bias-free two-layer MLP: `hidden_size -> intermediate_size -> projection_dim`."""

	    def __init__(self, config):
	        super().__init__()
	        self.config = config
	        inner_dim = config.intermediate_size
	        self.fc1 = nn.Linear(config.hidden_size, inner_dim, bias=False)
	        # Activation is looked up by name from `config.hidden_act`.
	        self.activation_fn = ACT2FN[config.hidden_act]
	        self.fc2 = nn.Linear(inner_dim, config.projection_dim, bias=False)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Apply `fc1`, the activation, then `fc2`."""
	        expanded = self.fc1(hidden_states)
	        activated = self.activation_fn(expanded)
	        return self.fc2(activated)


	class BiomedCLIPEncoder(nn.Module):
	    """
	    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
	    [`BiomedCLIPEncoderLayer`].

	    The layers implement self-attention only and return `(hidden_states,)` or
	    `(hidden_states, attn_weights)`. Consequently no key/value cache and no
	    cross-attention weights are ever produced: `head_mask`, `encoder_hidden_states`,
	    `encoder_attention_mask` and `past_key_values` are accepted for signature
	    compatibility but ignored, and `use_cache` has no effect.

	    Args:
	        config: BiomedCLIPConfig
	        norm: `'pre'` or `'post'`, forwarded to every layer.
	    """
	    def __init__(self, config, norm='pre'):
	        super().__init__()
	        self.config = config
	        self.norm = norm
	        self.layers = nn.ModuleList([BiomedCLIPEncoderLayer(config, norm) for _ in range(config.num_hidden_layers)])
	        self.gradient_checkpointing = False

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = False,
	        output_hidden_states: Optional[bool] = False,
	        return_dict: Optional[bool] = True,
	    ) :
	        """Run `hidden_states` through every layer in order.

	        Args:
	            hidden_states: input embeddings.
	            attention_mask: additive attention mask passed unchanged to each layer.
	            output_attentions: collect the attention weights of every layer.
	            output_hidden_states: collect the input of every layer plus the final output.
	            return_dict: return a `BaseModelOutputWithPastAndCrossAttentions` instead of a tuple.

	        Returns:
	            With `return_dict`, a model output whose `past_key_values` and
	            `cross_attentions` are always `None`. Otherwise a tuple of the
	            non-`None` values among `(hidden_states, all_hidden_states, all_self_attentions)`.
	        """
	        if use_cache:
	            # The layers return no key/value states, so there is nothing to cache.
	            # The previous code stored `layer_outputs[-1]` (a hidden state or an
	            # attention map) as if it were a cache entry.
	            logger.warning_once(
	                "`use_cache=True` is not supported by BiomedCLIPEncoder (its layers return no key/value"
	                " states); ignoring it."
	            )

	        all_hidden_states = () if output_hidden_states else None
	        all_self_attentions = () if output_attentions else None
	        use_checkpointing = self.gradient_checkpointing and self.training

	        for layer_module in self.layers:
	            if output_hidden_states:
	                all_hidden_states = all_hidden_states + (hidden_states,)

	            if use_checkpointing:
	                layer_outputs = self._gradient_checkpointing_func(
	                    layer_module.__call__,
	                    hidden_states,
	                    attention_mask,
	                    output_attentions,
	                )
	            else:
	                layer_outputs = layer_module(
	                    hidden_states,
	                    attention_mask,
	                    output_attentions,
	                )

	            hidden_states = layer_outputs[0]
	            if output_attentions:
	                # Index 1 is the only extra element a layer returns; there is no
	                # index 2 (cross-attention), which used to raise IndexError when
	                # `config.add_cross_attention` was set.
	                all_self_attentions = all_self_attentions + (layer_outputs[1],)

	        if output_hidden_states:
	            all_hidden_states = all_hidden_states + (hidden_states,)

	        if not return_dict:
	            return tuple(
	                v
	                for v in (hidden_states, all_hidden_states, all_self_attentions)
	                if v is not None
	            )
	        return BaseModelOutputWithPastAndCrossAttentions(
	            last_hidden_state=hidden_states,
	            past_key_values=None,
	            hidden_states=all_hidden_states,
	            attentions=all_self_attentions,
	            cross_attentions=None,
	        )



	class BiomedCLIPTextTransformer(CLIPPreTrainedModel):
	    """Text tower: `BiomedCLIPTextEmbeddings` followed by a post-norm `BiomedCLIPEncoder`.

	    No final layer norm is applied. The pooled output is the hidden state of
	    the first token of each sequence.
	    """

	    def __init__(self, config: CLIPTextConfig):
	        super().__init__(config)
	        self.config = config
	        self.embeddings = BiomedCLIPTextEmbeddings(config)
	        self.encoder = BiomedCLIPEncoder(config, norm='post')
	        # no final_ln

	    def forward(
	        self,
	        input_ids: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        token_type_ids: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        inputs_embeds: Optional[torch.Tensor] = None,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        encoder_attention_mask: Optional[torch.Tensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
	        r"""
	        Encode a batch of token ids (or token embeddings).

	        Args:
	            input_ids / inputs_embeds: exactly one of the two must be given.
	            attention_mask: `(batch_size, sequence_length)` mask, 1 for tokens to attend to and 0 for
	                masked tokens; defaults to all ones.
	            token_type_ids: defaults to the all-zero buffer of the embedding module.
	            position_ids: forwarded to the embedding module.
	            past_key_values: only its length (`past_key_values[0][0].shape[2]`) is used, as an offset
	                for the default position ids and the default attention mask.
	            encoder_hidden_states, encoder_attention_mask: cross-attention inputs. The encoder layers
	                implement self-attention only, so these are not used.
	            use_cache: accepted for signature compatibility; not used.
	            output_attentions, output_hidden_states, return_dict: resolved against the config and
	                forwarded to the encoder.

	        Returns:
	            Always the 2-tuple `(sequence_output, pooled_output)`, where `pooled_output` is
	            `sequence_output[:, 0, :]`. NOTE(review): hidden states / attentions collected by the
	            encoder are not part of the return value, and `return_dict` does not change its type.

	        Raises:
	            ValueError: if both or neither of `input_ids` and `inputs_embeds` are given.
	            NotImplementedError: if the config is a decoder config and `encoder_hidden_states` is given.
	        """
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        if input_ids is not None and inputs_embeds is not None:
	            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
	        elif input_ids is not None:
	            input_shape = input_ids.size()
	        elif inputs_embeds is not None:
	            input_shape = inputs_embeds.size()[:-1]
	        else:
	            raise ValueError("You have to specify either input_ids or inputs_embeds")

	        # This path used to reference names that are not defined in this module
	        # (`use_sdpa_attention_masks`, `_prepare_4d_attention_mask_for_sdpa`) and
	        # so failed with a NameError; fail with an explicit message instead.
	        if getattr(self.config, "is_decoder", False) and encoder_hidden_states is not None:
	            raise NotImplementedError(
	                "BiomedCLIPTextTransformer does not implement cross-attention; `encoder_hidden_states` is not"
	                " supported."
	            )

	        batch_size, seq_length = input_shape
	        device = input_ids.device if input_ids is not None else inputs_embeds.device

	        # past_key_values_length
	        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

	        if token_type_ids is None:
	            if hasattr(self.embeddings, "token_type_ids"):
	                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
	                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
	            else:
	                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

	        embedding_output = self.embeddings(
	            input_ids=input_ids,
	            position_ids=position_ids,
	            token_type_ids=token_type_ids,
	            inputs_embeds=inputs_embeds,
	            past_key_values_length=past_key_values_length,
	        )

	        if attention_mask is None:
	            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

	        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
	        # ourselves in which case we just need to make it broadcastable to all heads.
	        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

	        encoder_outputs = self.encoder(
	            embedding_output,
	            attention_mask=extended_attention_mask,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )
	        sequence_output = encoder_outputs[0]

	        # CLS pooling: the first token's hidden state is the pooled representation.
	        return (sequence_output, sequence_output[:, 0, :])



	class BiomedCLIPVisionTransformer(nn.Module):
	    """Vision tower: patch embeddings, a pre-norm encoder, and a LayerNorm on the pooled token.

	    No LayerNorm is applied between the embeddings and the encoder.
	    """

	    def __init__(self, config: CLIPVisionConfig):
	        super().__init__()
	        self.config = config

	        self.embeddings = BiomedCLIPVisionEmbeddings(config)
	        # No pre_norm in open_clip Vision Tower
	        self.encoder = BiomedCLIPEncoder(config)
	        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

	    def forward(
	        self,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, BaseModelOutputWithPooling]:
	        r"""
	        Encode `pixel_values`.

	        Flags left as `None` fall back to the corresponding config values.

	        Returns:
	            With `return_dict`, a `BaseModelOutputWithPooling`; otherwise the tuple
	            `(last_hidden_state, pooled_output) + encoder_outputs[1:]`. `pooled_output`
	            is `post_layernorm` applied to the first token of `last_hidden_state`.

	        Raises:
	            ValueError: if `pixel_values` is None.
	        """
	        cfg = self.config
	        if output_attentions is None:
	            output_attentions = cfg.output_attentions
	        if output_hidden_states is None:
	            output_hidden_states = cfg.output_hidden_states
	        if return_dict is None:
	            return_dict = cfg.use_return_dict

	        if pixel_values is None:
	            raise ValueError("You have to specify pixel_values")

	        patch_tokens = self.embeddings(pixel_values)

	        encoder_outputs = self.encoder(
	            hidden_states=patch_tokens,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        last_hidden_state = encoder_outputs[0]
	        first_token = last_hidden_state[:, 0, :]
	        pooled_output = self.post_layernorm(first_token)

	        if return_dict:
	            return BaseModelOutputWithPooling(
	                last_hidden_state=last_hidden_state,
	                pooler_output=pooled_output,
	                hidden_states=encoder_outputs.hidden_states,
	                attentions=encoder_outputs.attentions,
	            )

	        return (last_hidden_state, pooled_output) + encoder_outputs[1:]


	class BiomedCLIPModel(CLIPPreTrainedModel):
	    """Dual-tower contrastive model built from `BiomedCLIPTextTransformer` and
	    `BiomedCLIPVisionTransformer`, each followed by its own projection into a
	    shared embedding space, plus a learnable logit scale.
	    """

	    config_class = BiomedCLIPConfig
	    _no_split_modules = ["BiomedCLIPTextEmbeddings", "BiomedCLIPEncoderLayer"]

	    def __init__(self, config: BiomedCLIPConfig):
	        super().__init__(config)

	        if not isinstance(config.text_config, CLIPTextConfig):
	            raise ValueError(
	                "config.text_config is expected to be of type CLIPTextConfig but is of type"
	                f" {type(config.text_config)}."
	            )

	        if not isinstance(config.vision_config, CLIPVisionConfig):
	            raise ValueError(
	                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
	                f" {type(config.vision_config)}."
	            )

	        text_cfg = config.text_config
	        text_proj_cfg = config.text_projection_config
	        vision_cfg = config.vision_config

	        self.projection_dim = config.projection_dim
	        self.text_embed_dim = text_cfg.hidden_size
	        self.vision_embed_dim = vision_cfg.hidden_size

	        # Towers first, then projections, in the same order as before so that
	        # parameter registration and initialization order are unchanged.
	        self.text_model = BiomedCLIPTextTransformer(text_cfg)
	        self.vision_model = BiomedCLIPVisionTransformer(vision_cfg)

	        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
	        self.text_projection = BiomedCLIPTextProjection(text_proj_cfg)

	        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

	        # Initialize weights and apply final processing
	        self.post_init()

	    def _resolve_output_flags(self, output_attentions, output_hidden_states, return_dict):
	        """Replace any flag left as `None` by the value stored on `self.config`."""
	        if output_attentions is None:
	            output_attentions = self.config.output_attentions
	        if output_hidden_states is None:
	            output_hidden_states = self.config.output_hidden_states
	        if return_dict is None:
	            return_dict = self.config.use_return_dict
	        return output_attentions, output_hidden_states, return_dict

	    def get_text_features(
	        self,
	        input_ids: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        token_type_ids: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> torch.FloatTensor:
	        r"""
	        Returns:
	            text_features (`torch.FloatTensor`): the text embeddings obtained by applying
	            `text_projection` to the pooled output (index 1) of the text tower. They are
	            not L2-normalized here.
	        """
	        # Use this model's config for flags the caller left unspecified.
	        output_attentions, output_hidden_states, return_dict = self._resolve_output_flags(
	            output_attentions, output_hidden_states, return_dict
	        )

	        text_outputs = self.text_model(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            token_type_ids=token_type_ids,
	            position_ids=position_ids,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        return self.text_projection(text_outputs[1])

	    def get_image_features(
	        self,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> torch.FloatTensor:
	        r"""
	        Returns:
	            image_features (`torch.FloatTensor`): the image embeddings obtained by applying
	            `visual_projection` to the pooled output (index 1) of the vision tower. They are
	            not L2-normalized here.
	        """
	        # Use this model's config for flags the caller left unspecified.
	        output_attentions, output_hidden_states, return_dict = self._resolve_output_flags(
	            output_attentions, output_hidden_states, return_dict
	        )

	        vision_outputs = self.vision_model(
	            pixel_values=pixel_values,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        return self.visual_projection(vision_outputs[1])

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        return_loss: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, CLIPOutput]:
	        r"""
	        Embed images and texts and score every text against every image.

	        Both projected embeddings are L2-normalized; the logits are their dot
	        products multiplied by `exp(logit_scale)`. When `return_loss` is truthy,
	        `clip_loss` of the text-to-image logits is also computed.

	        Returns:
	            With `return_dict`, a `CLIPOutput`. Otherwise the tuple
	            `(logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)`,
	            prefixed by the loss when one was computed.
	        """
	        # Use this model's config for flags the caller left unspecified.
	        output_attentions, output_hidden_states, return_dict = self._resolve_output_flags(
	            output_attentions, output_hidden_states, return_dict
	        )

	        vision_outputs = self.vision_model(
	            pixel_values=pixel_values,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        text_outputs = self.text_model(
	            input_ids=input_ids,
	            token_type_ids=token_type_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        image_embeds = self.visual_projection(vision_outputs[1])
	        text_embeds = self.text_projection(text_outputs[1])

	        # normalized features
	        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
	        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

	        # cosine similarity as logits
	        logit_scale = self.logit_scale.exp()
	        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
	        logits_per_image = logits_per_text.t()

	        loss = clip_loss(logits_per_text) if return_loss else None

	        if return_dict:
	            return CLIPOutput(
	                loss=loss,
	                logits_per_image=logits_per_image,
	                logits_per_text=logits_per_text,
	                text_embeds=text_embeds,
	                image_embeds=image_embeds,
	                text_model_output=text_outputs,
	                vision_model_output=vision_outputs,
	            )

	        output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
	        if loss is None:
	            return output
	        return (loss,) + output


	class BiomedCLIPForImageClassification(CLIPPreTrainedModel):
	    """`BiomedCLIPVisionTransformer` with a linear classification head on the
	    mean of the patch tokens (the first token is excluded from the average).
	    """

	    # Same config class as `BiomedCLIPModel`, matching the `__init__` annotation.
	    config_class = BiomedCLIPConfig
	    main_input_name = "pixel_values"

	    def __init__(self, config: BiomedCLIPConfig) -> None:
	        super().__init__(config)

	        self.num_labels = config.num_labels
	        self.vision_model = BiomedCLIPVisionTransformer(config.vision_config)

	        # Classifier head
	        self.classifier = (
	            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
	        )

	        # Initialize weights and apply final processing
	        self.post_init()

	    def forward(
	        self,
	        pixel_values: Optional[torch.Tensor] = None,
	        labels: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[tuple, ImageClassifierOutput]:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
	            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
	            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

	        When `config.problem_type` is unset it is inferred from `num_labels` and the dtype of
	        `labels` on the first call with labels, and stored on the config.

	        Returns:
	            With `return_dict`, an `ImageClassifierOutput`; otherwise `(logits,) + outputs[2:]`,
	            prefixed by the loss when `labels` were given.
	        """
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        outputs = self.vision_model(
	            pixel_values,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        sequence_output = outputs[0]

	        # average pool the patch tokens
	        sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
	        # apply classifier
	        logits = self.classifier(sequence_output)

	        loss = None
	        if labels is not None:
	            # move labels to correct device to enable model parallelism
	            labels = labels.to(logits.device)
	            if self.config.problem_type is None:
	                if self.num_labels == 1:
	                    self.config.problem_type = "regression"
	                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
	                    self.config.problem_type = "single_label_classification"
	                else:
	                    self.config.problem_type = "multi_label_classification"

	            if self.config.problem_type == "regression":
	                loss_fct = MSELoss()
	                if self.num_labels == 1:
	                    loss = loss_fct(logits.squeeze(), labels.squeeze())
	                else:
	                    loss = loss_fct(logits, labels)
	            elif self.config.problem_type == "single_label_classification":
	                loss_fct = CrossEntropyLoss()
	                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
	            elif self.config.problem_type == "multi_label_classification":
	                loss_fct = BCEWithLogitsLoss()
	                loss = loss_fct(logits, labels)

	        if not return_dict:
	            output = (logits,) + outputs[2:]
	            return ((loss,) + output) if loss is not None else output

	        return ImageClassifierOutput(
	            loss=loss,
	            logits=logits,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )