feat: jina clip v2 implementation

Browse files

Files changed (7) hide show

.gitignore +70 -0
configuration_clip.py +0 -6
eva_model.py +27 -27
hf_model.py +56 -85
modeling_clip.py +197 -156
processing_clip.py +0 -1
transform.py +95 -179

.gitignore ADDED Viewed

	@@ -0,0 +1,70 @@

+# Project specific
+__init__.py
+pyproject.toml
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# PyCharm
+.idea/

configuration_clip.py CHANGED Viewed

@@ -47,11 +47,9 @@ class JinaCLIPTextConfig(PretrainedConfig):
         configdict, kwargs = cls.get_config_dict(
             pretrained_model_name_or_path, **kwargs
         )
         # get the text config dict if we are loading from JinaCLIPConfig
         if configdict.get('model_type') == 'jina_clip':
             configdict = configdict['text_config']
         if (
             'model_type' in configdict
             and hasattr(cls, 'model_type')
@@ -62,7 +60,6 @@ class JinaCLIPTextConfig(PretrainedConfig):
                 f'instantiate a model of type {cls.model_type}. This is not supported '
                 'for all configurations of models and can yield errors.'
             )
         return cls.from_dict(configdict, **kwargs)
@@ -125,11 +122,9 @@ class JinaCLIPVisionConfig(PretrainedConfig):
         configdict, kwargs = cls.get_config_dict(
             pretrained_model_name_or_path, **kwargs
         )
         # get the vision config dict if we are loading from JinaCLIPConfig
         if configdict.get('model_type') == 'jina_clip':
             configdict = configdict['vision_config']
         if (
             'model_type' in configdict
             and hasattr(cls, 'model_type')
@@ -140,7 +135,6 @@ class JinaCLIPVisionConfig(PretrainedConfig):
                 f'instantiate a model of type {cls.model_type}. This is not supported '
                 'for all configurations of models and can yield errors.'
             )
         return cls.from_dict(configdict, **kwargs)

         configdict, kwargs = cls.get_config_dict(
             pretrained_model_name_or_path, **kwargs
         )
         # get the text config dict if we are loading from JinaCLIPConfig
         if configdict.get('model_type') == 'jina_clip':
             configdict = configdict['text_config']
         if (
             'model_type' in configdict
             and hasattr(cls, 'model_type')
                 f'instantiate a model of type {cls.model_type}. This is not supported '
                 'for all configurations of models and can yield errors.'
             )
         return cls.from_dict(configdict, **kwargs)
         configdict, kwargs = cls.get_config_dict(
             pretrained_model_name_or_path, **kwargs
         )
         # get the vision config dict if we are loading from JinaCLIPConfig
         if configdict.get('model_type') == 'jina_clip':
             configdict = configdict['vision_config']
         if (
             'model_type' in configdict
             and hasattr(cls, 'model_type')
                 f'instantiate a model of type {cls.model_type}. This is not supported '
                 'for all configurations of models and can yield errors.'
             )
         return cls.from_dict(configdict, **kwargs)

eva_model.py CHANGED Viewed

@@ -9,12 +9,12 @@ from functools import partial
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 try:
-    from timm.models.layers import drop_path, to_2tuple, trunc_normal_
 except ImportError or ModuleNotFoundError:
-    from timm.layers import drop_path, to_2tuple, trunc_normal_
 from .rope_embeddings import VisionRotaryEmbeddingFast
@@ -81,7 +81,7 @@ class DropPath(nn.Module):
         self.drop_prob = drop_prob
     def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training)
     def extra_repr(self) -> str:
         return 'p={}'.format(self.drop_prob)
@@ -244,17 +244,17 @@ class Attention(nn.Module):
         self.rope = rope
     def forward(self, x, rel_pos_bias=None, attn_mask=None):
-        B, N, C = x.shape
         if self.subln:
-            q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
-            k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
-            v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
-            q = q.reshape(B, N, self.num_heads, -1).permute(
                 0, 2, 1, 3
             )  # B, num_heads, N, C
-            k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
-            v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
         else:
             qkv_bias = None
             if self.q_bias is not None:
@@ -266,8 +266,8 @@ class Attention(nn.Module):
                     )
                 )
-            qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
-            qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(
                 2, 0, 3, 1, 4
             )  # 3, B, num_heads, N, C
             q, k, v = qkv[0], qkv[1], qkv[2]
@@ -298,7 +298,7 @@ class Attention(nn.Module):
                 p=self.xattn_drop,
                 scale=self.scale,
             )
-            x = x.reshape(B, N, -1)
             x = self.inner_attn_ln(x)
             x = self.proj(x)
             x = self.proj_drop(x)
@@ -329,7 +329,7 @@ class Attention(nn.Module):
             attn = attn.softmax(dim=-1)
             attn = self.attn_drop(attn)
-            x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
             x = self.inner_attn_ln(x)
             x = self.proj(x)
             x = self.proj_drop(x)
@@ -461,12 +461,12 @@ class PatchEmbed(nn.Module):
             in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
         )
-    def forward(self, x, **kwargs):
         target_dtype = self.proj.weight.dtype
-        B, C, H, W = x.shape
         # FIXME look at relaxing size constraints
-        assert H == self.img_size[0] and W == self.img_size[1], (
-            f"Input image size ({H}*{W}) doesn't match model "
             f'({self.img_size[0]}*{self.img_size[1]}).'
         )
         x = self.proj(x.to(dtype=target_dtype)).flatten(2).transpose(1, 2)
@@ -559,9 +559,8 @@ class EVAVisionTransformer(nn.Module):
         super().__init__()
         self.image_size = img_size
         self.num_classes = num_classes
-        self.num_features = (
-            self.embed_dim
-        ) = embed_dim  # num_features for consistency with other models
         self.patch_embed = PatchEmbed(
             img_size=img_size,
@@ -666,8 +665,8 @@ class EVAVisionTransformer(nn.Module):
         self.grad_checkpointing = grad_checkpointing
     def fix_init_weight(self):
-        def rescale(param, layer_id):
-            param.div_(math.sqrt(2.0 * layer_id))
         for layer_id, layer in enumerate(self.blocks):
             rescale(layer.attn.proj.weight.data, layer_id + 1)
@@ -679,7 +678,8 @@ class EVAVisionTransformer(nn.Module):
     def get_cast_dtype(self) -> torch.dtype:
         return self.blocks[0].mlp.fc2.weight.dtype
-    def _init_weights(self, m):
         if isinstance(m, nn.Linear):
             trunc_normal_(m.weight, std=0.02)
             if m.bias is not None:
@@ -691,7 +691,7 @@ class EVAVisionTransformer(nn.Module):
     def get_num_layers(self):
         return len(self.blocks)
-    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
         assert (
             unlocked_groups == 0
         ), 'partial locking not currently supported for this model'
@@ -709,7 +709,7 @@ class EVAVisionTransformer(nn.Module):
     def get_classifier(self):
         return self.head
-    def reset_classifier(self, num_classes, global_pool=''):
         self.num_classes = num_classes
         self.head = (
             nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

 import torch
 import torch.nn as nn
+import torch.nn.functional as f
 try:
+    from timm.models.layers import drop_path as timm_drop_path, to_2tuple, trunc_normal_
 except ImportError or ModuleNotFoundError:
+    from timm.layers import drop_path as timm_drop_path, to_2tuple, trunc_normal_
 from .rope_embeddings import VisionRotaryEmbeddingFast
         self.drop_prob = drop_prob
     def forward(self, x):
+        return timm_drop_path(x, self.drop_prob, self.training)
     def extra_repr(self) -> str:
         return 'p={}'.format(self.drop_prob)
         self.rope = rope
     def forward(self, x, rel_pos_bias=None, attn_mask=None):
+        b, n, _ = x.shape
         if self.subln:
+            q = f.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
+            k = f.linear(input=x, weight=self.k_proj.weight, bias=None)
+            v = f.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
+            q = q.reshape(b, n, self.num_heads, -1).permute(
                 0, 2, 1, 3
             )  # B, num_heads, N, C
+            k = k.reshape(b, n, self.num_heads, -1).permute(0, 2, 1, 3)
+            v = v.reshape(b, n, self.num_heads, -1).permute(0, 2, 1, 3)
         else:
             qkv_bias = None
             if self.q_bias is not None:
                     )
                 )
+            qkv = f.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+            qkv = qkv.reshape(b, n, 3, self.num_heads, -1).permute(
                 2, 0, 3, 1, 4
             )  # 3, B, num_heads, N, C
             q, k, v = qkv[0], qkv[1], qkv[2]
                 p=self.xattn_drop,
                 scale=self.scale,
             )
+            x = x.reshape(b, n, -1)
             x = self.inner_attn_ln(x)
             x = self.proj(x)
             x = self.proj_drop(x)
             attn = attn.softmax(dim=-1)
             attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(b, n, -1)
             x = self.inner_attn_ln(x)
             x = self.proj(x)
             x = self.proj_drop(x)
             in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
         )
+    def forward(self, x, **_):
         target_dtype = self.proj.weight.dtype
+        _, __, h, w = x.shape
         # FIXME look at relaxing size constraints
+        assert h == self.img_size[0] and w == self.img_size[1], (
+            f"Input image size ({h}*{w}) doesn't match model "
             f'({self.img_size[0]}*{self.img_size[1]}).'
         )
         x = self.proj(x.to(dtype=target_dtype)).flatten(2).transpose(1, 2)
         super().__init__()
         self.image_size = img_size
         self.num_classes = num_classes
+        # num_features for consistency with other models
+        self.num_features = self.embed_dim = embed_dim
         self.patch_embed = PatchEmbed(
             img_size=img_size,
         self.grad_checkpointing = grad_checkpointing
     def fix_init_weight(self):
+        def rescale(param, _layer_id):
+            param.div_(math.sqrt(2.0 * _layer_id))
         for layer_id, layer in enumerate(self.blocks):
             rescale(layer.attn.proj.weight.data, layer_id + 1)
     def get_cast_dtype(self) -> torch.dtype:
         return self.blocks[0].mlp.fc2.weight.dtype
+    @staticmethod
+    def _init_weights(m):
         if isinstance(m, nn.Linear):
             trunc_normal_(m.weight, std=0.02)
             if m.bias is not None:
     def get_num_layers(self):
         return len(self.blocks)
+    def lock(self, unlocked_groups=0, *_, **__):
         assert (
             unlocked_groups == 0
         ), 'partial locking not currently supported for this model'
     def get_classifier(self):
         return self.head
+    def reset_classifier(self, num_classes, *_, **__):
         self.num_classes = num_classes
         self.head = (
             nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

hf_model.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import re
-from typing import Dict, Optional, Tuple
 import torch
 import torch.nn as nn
 from transformers import AutoConfig, AutoModel, PretrainedConfig
@@ -10,9 +9,6 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPoolingAndCrossAttentions,
 )
-"""
-HF architecture mapping
-"""
 _HF_ARCH_DICT = {
     # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
@@ -41,22 +37,6 @@ _HF_ARCH_DICT = {
         },
         'pooler': 'mean_pooler',
     },
-    # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
-    'mt5': {
-        'config_names': {
-            # unlimited seqlen
-            # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
-            # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
-            'context_length': '',
-            'vocab_size': 'vocab_size',
-            'width': 'd_model',
-            'heads': 'num_heads',
-            'layers': 'num_layers',
-            'layer_attr': 'block',
-            'token_embeddings_attr': 'embed_tokens',
-        },
-        'pooler': 'mean_pooler',
-    },
     # https://huggingface.co/docs/transformers/model_doc/bert
     'bert': {
         'config_names': {
@@ -68,24 +48,8 @@ _HF_ARCH_DICT = {
         },
         'pooler': 'cls_pooler',
     },
-    # https://huggingface.co/docs/transformers/model_doc/m2m_100
-    'm2m_100': {
-        'config_names': {
-            'context_length': 'max_position_embeddings',
-            'vocab_size': 'vocab_size',
-            'width': 'd_model',
-            'heads': 'encoder_attention_heads',
-            'layers': 'encoder_layers',
-        },
-        'pooler': 'cls_pooler',
-    },
 }
-"""
-Pooling functions
-"""
 _POOLERS = {}
@@ -101,8 +65,6 @@ def register_pooler(cls):
 @register_pooler
 class MeanPooler(nn.Module):
-    """Mean pooling"""
     @staticmethod
     def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
         masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
@@ -111,10 +73,6 @@ class MeanPooler(nn.Module):
 @register_pooler
 class MaxPooler(nn.Module):
-    """
-    Max pooling
-    """
     @staticmethod
     def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
         masked_output = x.last_hidden_state.masked_fill(
@@ -125,11 +83,7 @@ class MaxPooler(nn.Module):
 @register_pooler
 class ClsPooler(nn.Module):
-    """
-    CLS token pooling
-    """
-    def __init__(self, use_pooler_output=True):
         super().__init__()
         self.cls_token_position = 0
         self.use_pooler_output = use_pooler_output
@@ -147,15 +101,9 @@ class ClsPooler(nn.Module):
             and (x.pooler_output is not None)
         ):
             return x.pooler_output
         return x.last_hidden_state[:, self.cls_token_position, :]
-"""
-HF text model
-"""
 class HFTextEncoder(nn.Module):
     output_tokens: torch.jit.Final[bool]
@@ -171,21 +119,21 @@ class HFTextEncoder(nn.Module):
         output_tokens: bool = False,
         trust_remote_code: bool = False,
         revision: Optional[str] = None,
         model_config_kwargs: Optional[Dict] = None,
     ):
         super().__init__()
         self.output_tokens = output_tokens
         self.output_dim = output_dim
-        # TODO: find better way to get this information
-        uses_transformer_pooler = pooler_type == 'cls_pooler'
         model_config_kwargs = model_config_kwargs or {}
         if config is None:
             self.config = AutoConfig.from_pretrained(
                 model_name_or_path,
                 trust_remote_code=trust_remote_code,
-                code_revision=revision,
             )
             self.config.update(model_config_kwargs)
             create_func, model_args = (
@@ -193,34 +141,40 @@ class HFTextEncoder(nn.Module):
                 if pretrained
                 else (AutoModel.from_config, self.config)
             )
-            # TODO: do all model configs have this attribute?
-            #  PretrainedConfig does so yes??
             if (
                 hasattr(self.config, 'is_encoder_decoder')
                 and self.config.is_encoder_decoder
             ):
-                self.transformer = create_func(model_args)
                 self.transformer = self.transformer.encoder
             else:
                 self.transformer = create_func(
                     model_args,
                     trust_remote_code=trust_remote_code,
-                    add_pooling_layer=uses_transformer_pooler,
-                    code_revision=revision,
                 )
         else:
             self.config = config
             self.config.update(model_config_kwargs)
-            self.transformer = AutoModel.from_config(self.config)
-        if pooler_type is None:  # get default arch pooler
-            pooler_type = _HF_ARCH_DICT[self.config.model_type]['pooler']
-        # FIXME downstream users of OpenCLIP models use these attr,
-        #  need to verify valid across all models
         self.vocab_size = getattr(self.config, 'vocab_size', 0)
         self.context_length = getattr(self.config, 'max_position_embeddings', 0)
         self.pooler = _POOLERS[pooler_type]()
         d_model = getattr(
@@ -228,7 +182,7 @@ class HFTextEncoder(nn.Module):
         )
         if (d_model == output_dim) and (proj_type is None):  # do we always need a proj?
             self.proj = nn.Identity()
-        elif proj_type == 'linear':
             self.proj = nn.Linear(d_model, output_dim, bias=proj_bias)
         elif proj_type == 'mlp':
             hidden_size = (d_model + output_dim) // 2
@@ -238,27 +192,52 @@ class HFTextEncoder(nn.Module):
                 nn.Linear(hidden_size, output_dim, bias=proj_bias),
             )
-    def forward(self, x: torch.Tensor):
         attn_mask = (x != self.config.pad_token_id).long()
-        out = self.transformer(input_ids=x, attention_mask=attn_mask)
         pooled_out = self.pooler(out, attn_mask)
         projected = self.proj(pooled_out)
-        seq_len = out.last_hidden_state.shape[1]
         tokens = (
             out.last_hidden_state[
-                :, torch.arange(seq_len) != self.pooler.cls_token_position, :
             ]
             if isinstance(self.pooler, ClsPooler)
             else out.last_hidden_state
         )
         if self.output_tokens:
             return projected, tokens
         return projected
     def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
-        if not unlocked_layers:  # full freezing
             for n, p in self.transformer.named_parameters():
                 p.requires_grad = (
                     (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
@@ -287,11 +266,3 @@ class HFTextEncoder(nn.Module):
                 p.requires_grad = (
                     (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
                 )
-    @torch.jit.ignore
-    def set_grad_checkpointing(self, _=True):
-        self.transformer.gradient_checkpointing_enable()
-    def init_parameters(self):
-        pass

 import re
+from typing import Dict, Optional
 import torch
 import torch.nn as nn
 from transformers import AutoConfig, AutoModel, PretrainedConfig
     BaseModelOutputWithPoolingAndCrossAttentions,
 )
 _HF_ARCH_DICT = {
     # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
         },
         'pooler': 'mean_pooler',
     },
     # https://huggingface.co/docs/transformers/model_doc/bert
     'bert': {
         'config_names': {
         },
         'pooler': 'cls_pooler',
     },
 }
 _POOLERS = {}
 @register_pooler
 class MeanPooler(nn.Module):
     @staticmethod
     def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
         masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
 @register_pooler
 class MaxPooler(nn.Module):
     @staticmethod
     def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
         masked_output = x.last_hidden_state.masked_fill(
 @register_pooler
 class ClsPooler(nn.Module):
+    def __init__(self, use_pooler_output: bool = True):
         super().__init__()
         self.cls_token_position = 0
         self.use_pooler_output = use_pooler_output
             and (x.pooler_output is not None)
         ):
             return x.pooler_output
         return x.last_hidden_state[:, self.cls_token_position, :]
 class HFTextEncoder(nn.Module):
     output_tokens: torch.jit.Final[bool]
         output_tokens: bool = False,
         trust_remote_code: bool = False,
         revision: Optional[str] = None,
+        code_revision: Optional[str] = None,
         model_config_kwargs: Optional[Dict] = None,
     ):
         super().__init__()
         self.output_tokens = output_tokens
         self.output_dim = output_dim
         model_config_kwargs = model_config_kwargs or {}
         if config is None:
             self.config = AutoConfig.from_pretrained(
                 model_name_or_path,
                 trust_remote_code=trust_remote_code,
+                revision=revision,
+                code_revision=code_revision,
             )
             self.config.update(model_config_kwargs)
             create_func, model_args = (
                 if pretrained
                 else (AutoModel.from_config, self.config)
             )
             if (
                 hasattr(self.config, 'is_encoder_decoder')
                 and self.config.is_encoder_decoder
             ):
+                self.transformer = create_func(
+                    model_args,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    code_revision=code_revision,
+                    **model_config_kwargs,
+                )
                 self.transformer = self.transformer.encoder
             else:
                 self.transformer = create_func(
                     model_args,
                     trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    add_pooling_layer=False,
+                    code_revision=code_revision,
+                    **model_config_kwargs,
                 )
         else:
             self.config = config
             self.config.update(model_config_kwargs)
+            self.transformer = AutoModel.from_config(
+                self.config,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                code_revision=code_revision,
+            )
         self.vocab_size = getattr(self.config, 'vocab_size', 0)
         self.context_length = getattr(self.config, 'max_position_embeddings', 0)
+        pooler_type = pooler_type or _HF_ARCH_DICT[self.config.model_type]['pooler']
         self.pooler = _POOLERS[pooler_type]()
         d_model = getattr(
         )
         if (d_model == output_dim) and (proj_type is None):  # do we always need a proj?
             self.proj = nn.Identity()
+        elif (d_model != output_dim) or proj_type == 'linear':
             self.proj = nn.Linear(d_model, output_dim, bias=proj_bias)
         elif proj_type == 'mlp':
             hidden_size = (d_model + output_dim) // 2
                 nn.Linear(hidden_size, output_dim, bias=proj_bias),
             )
+        self._task_instructions = {}
+        self._lora_adaptation_map = {}
+        self._supports_task_instructions = False
+        self._supports_lora = False
+        if (
+            hasattr(self.transformer, '_adaptation_map')
+            and len(self.transformer._adaptation_map) > 0
+        ):
+            self._lora_adaptation_map = self.transformer._adaptation_map
+            self._supports_lora = True
+        if (
+            hasattr(self.transformer, '_task_instructions')
+            and len(self.transformer._task_instructions) > 0
+        ):
+            self._task_instructions = self.transformer._task_instructions
+            self._supports_task_instructions = True
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, _=True):
+        self.transformer.gradient_checkpointing_enable()
+    def init_parameters(self):
+        pass
+    def forward(self, x: torch.Tensor, adapter_mask: Optional[torch.Tensor] = None):
         attn_mask = (x != self.config.pad_token_id).long()
+        kwargs = {}
+        if adapter_mask is not None:
+            kwargs['adapter_mask'] = adapter_mask
+        out = self.transformer(input_ids=x, attention_mask=attn_mask, **kwargs)
         pooled_out = self.pooler(out, attn_mask)
         projected = self.proj(pooled_out)
+        seqlen = out.last_hidden_state.shape[1]
         tokens = (
             out.last_hidden_state[
+                :, torch.arange(seqlen) != self.pooler.cls_token_position, :
             ]
             if isinstance(self.pooler, ClsPooler)
             else out.last_hidden_state
         )
         if self.output_tokens:
             return projected, tokens
         return projected
     def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
+        if not unlocked_layers:
             for n, p in self.transformer.named_parameters():
                 p.requires_grad = (
                     (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
                 p.requires_grad = (
                     (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
                 )

modeling_clip.py CHANGED Viewed

@@ -14,6 +14,7 @@ import requests
 import torch
 import torch.nn.functional as f
 import torch.utils.checkpoint
 from torch import nn
 from transformers import (
     AutoImageProcessor,
@@ -35,13 +36,12 @@ try:
     has_tqdm = True
 except ImportError:
     has_tqdm = False
 from .configuration_clip import JinaCLIPConfig, JinaCLIPTextConfig, JinaCLIPVisionConfig
 from .eva_model import EVAVisionTransformer
 from .hf_model import HFTextEncoder
-# needed for HF to correctly import in cache
 from .rope_embeddings import VisionRotaryEmbeddingFast  # noqa: F401
 from .transform import (  # noqa: F401
     OPENAI_DATASET_MEAN,
@@ -157,6 +157,9 @@ class JinaCLIPTextModel(JinaCLIPPreTrainedModel):
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         return_dict: Optional[bool] = None,
         *_,
         **__,
     ) -> Union[Tuple[Optional[torch.FloatTensor], ...], CLIPTextModelOutput]:
@@ -164,7 +167,12 @@ class JinaCLIPTextModel(JinaCLIPPreTrainedModel):
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         x = input_ids.input_ids if isinstance(input_ids, BatchEncoding) else input_ids
-        feats = self.text_model(x=x)
         out = CLIPTextModelOutput(text_embeds=feats)
         return out if return_dict else out.to_tuple()
@@ -220,7 +228,9 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         vision_config = config.vision_config
         if config.use_text_flash_attn is not None:
-            text_config.hf_model_config_kwargs['use_flash_attn'] = config.use_text_flash_attn
         if config.use_vision_xformers is not None:
             vision_config.x_attention = config.use_vision_xformers
@@ -228,13 +238,11 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         self.projection_dim = config.projection_dim
         self.text_embed_dim = text_config.embed_dim
         self.vision_embed_dim = vision_config.embed_dim
         self.text_model = _build_text_tower(text_config)
         self.vision_model = _build_vision_tower(vision_config)
         self.logit_scale = nn.Parameter(
             torch.tensor(self.config.logit_scale_init_value)
         )
         if self.add_projections:
             self.visual_projection = nn.Linear(
                 self.vision_embed_dim, self.projection_dim, bias=False
@@ -267,11 +275,12 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
     def get_text_features(
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         *_,
         **__,
     ) -> torch.FloatTensor:
         x = input_ids.input_ids if isinstance(input_ids, BatchEncoding) else input_ids
-        return self.text_projection(self.text_model(x=x))
     def get_image_features(
         self,
@@ -286,24 +295,24 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         )
         return self.visual_projection(self.vision_model(x=x))
-    def truncate_embeddings(self, embeddings, truncate_dim):
         if not self.config.matryoshka_dimensions:
             logger.warning(
-                "Matryoshka embeddings are not supported, so dimension truncation will not be performed."
-            )
-            return embeddings
-        elif truncate_dim in self.config.matryoshka_dimensions:
-            return embeddings[:, :truncate_dim]
-        else:
-            raise ValueError(
-                f"The provided `truncate_dim` value of {truncate_dim} is not supported. "
-                f"Supported dimensions are {self.config.matryoshka_dimensions}."
             )
     @torch.inference_mode()
-    def encode_text(
         self,
-        sentences: Union[str, List[str]],
         batch_size: int = 32,
         show_progress_bar: Optional[bool] = None,
         convert_to_numpy: bool = True,
@@ -311,122 +320,129 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         device: Optional[torch.device] = None,
         normalize_embeddings: bool = True,
         truncate_dim: Optional[int] = None,
-        **tokenizer_kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
-        Computes sentence embeddings
-         Args:
-             sentences(`str` or `List[str]`):
-                 Sentence or sentences to be encoded
-             batch_size(`int`, *optional*, defaults to 32):
-                 Batch size for the computation
-             show_progress_bar(`bool`, *optional*, defaults to None):
-                 Show a progress bar when encoding sentences.
-                 If set to None, progress bar is only shown when
-                 `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
-             convert_to_numpy(`bool`, *optional*, defaults to True):
-                 If true, the output is a list of numpy vectors.
-                 Else, it is a list of pytorch tensors.
-             convert_to_tensor(`bool`, *optional*, defaults to False):
-                 If true, you get one large tensor as return.
-                 Overwrites any setting from convert_to_numpy
-             device(`torch.device`, *optional*, defaults to None):
-                 Which torch.device to use for the computation
-             normalize_embeddings(`bool`, *optional*, defaults to False):
-                 If set to true, returned vectors will have length 1. In that case,
-                 the faster dot-product (util.dot_score) instead of cosine similarity
-                 can be used.
-             truncate_dim(`int`, *optional*, defaults to None):
-                The dimension to truncate sentence embeddings to. `None` does no truncation.
-             tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
-                 Keyword arguments for the tokenizer
-         Returns:
-             By default, a list of tensors is returned.
-             If convert_to_tensor, a stacked tensor is returned.
-             If convert_to_numpy, a numpy matrix is returned.
         """
-        is_training = self.training
         self.eval()
-        all_embeddings = []
-        self.tokenizer = self.get_tokenizer()
         if show_progress_bar is None:
             show_progress_bar = (
                 logger.getEffectiveLevel() == logging.INFO
                 or logger.getEffectiveLevel() == logging.DEBUG
             )
         if convert_to_tensor:
             convert_to_numpy = False
-        input_was_string = False
-        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
-            sentences = [sentences]
-            input_was_string = True
         if device is not None:
             self.to(device)
-        permutation = np.argsort([-len(i) for i in sentences])
-        inverse_permutation = np.argsort(permutation)
-        sentences = [sentences[idx] for idx in permutation]
-        tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
-        tokenizer_kwargs['max_length'] = tokenizer_kwargs.get('max_length', 512)
-        tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
         if has_tqdm:
             range_iter = trange(
                 0,
-                len(sentences),
                 batch_size,
                 desc='Encoding',
                 disable=not show_progress_bar,
             )
         else:
-            range_iter = range(0, len(sentences), batch_size)
         truncate_dim = truncate_dim or self.config.truncate_dim
         for i in range_iter:
-            encoded_input = self.tokenizer(
-                sentences[i : i + batch_size],
-                return_tensors='pt',
-                **tokenizer_kwargs,
-            ).to(self.device)
-            embeddings = self.get_text_features(input_ids=encoded_input)
             if truncate_dim:
                 embeddings = self.truncate_embeddings(embeddings, truncate_dim)
             if normalize_embeddings:
-                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
             if convert_to_numpy:
                 embeddings = embeddings.cpu()
             all_embeddings.extend(embeddings)
-        all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
         if convert_to_tensor:
             all_embeddings = torch.stack(all_embeddings)
         elif convert_to_numpy:
-            all_embeddings = np.asarray([emb.to(torch.float32).numpy() for emb in all_embeddings])
-        if input_was_string:
             all_embeddings = all_embeddings[0]
-        self.train(is_training)
         return all_embeddings
-    def decode_data_image(data_image_str):
-        header, data = data_image_str.split(',', 1)
-        image_data = base64.b64decode(data)
-        return Image.open(BytesIO(image_data))
     @torch.inference_mode()
-    def encode_image(
         self,
-        images: Union[str, List[Union[str, "Image.Image"]]],
         batch_size: int = 32,
         show_progress_bar: Optional[bool] = None,
         convert_to_numpy: bool = True,
@@ -434,129 +450,153 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         device: Optional[torch.device] = None,
         normalize_embeddings: bool = True,
         truncate_dim: Optional[int] = None,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
-        Computes image embeddings.
         Args:
-            images(`str` or `List[Union[str, Image.Image]]`):
-                image paths, URLs, PIL images, or data:image/ strings to be encoded
             batch_size(`int`, *optional*, defaults to 32):
                 Batch size for the computation
             show_progress_bar(`bool`, *optional*, defaults to None):
-                Show a progress bar when encoding images.
-                If set to None, progress bar is only shown when
-                `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
             convert_to_numpy(`bool`, *optional*, defaults to True):
-                If true, the output is a list of numpy vectors.
-                Else, it is a list of pytorch tensors.
             convert_to_tensor(`bool`, *optional*, defaults to False):
-                If true, you get one large tensor as return.
-                Overwrites any setting from convert_to_numpy
             device(`torch.device`, *optional*, defaults to None):
                 Which torch.device to use for the computation
             normalize_embeddings(`bool`, *optional*, defaults to False):
                 If set to true, returned vectors will have length 1. In that case,
                 the faster dot-product (util.dot_score) instead of cosine similarity
-                can be used.
             truncate_dim(`int`, *optional*, defaults to None):
-                The dimension to truncate sentence embeddings to. `None` does no truncation.
         Returns:
-            By default, a list of tensors is returned.
-            If convert_to_tensor, a stacked tensor is returned.
-            If convert_to_numpy, a numpy matrix is returned.
         """
-        is_training = self.training
         self.eval()
-        self.preprocess = self.get_preprocess()
         all_embeddings = []
         if show_progress_bar is None:
             show_progress_bar = (
                 logger.getEffectiveLevel() == logging.INFO
                 or logger.getEffectiveLevel() == logging.DEBUG
             )
         if convert_to_tensor:
             convert_to_numpy = False
-        input_was_single_img = False
-        if isinstance(images, str) or not hasattr(images, '__len__'):
-            images = [images]
-            input_was_single_img = True
         if device is not None:
             self.to(device)
-        permutation = np.argsort([-len(str(i)) for i in images])
-        inverse_permutation = np.argsort(permutation)
-        images = [images[idx] for idx in permutation]
         if has_tqdm:
             range_iter = trange(
                 0,
-                len(images),
                 batch_size,
                 desc='Encoding',
                 disable=not show_progress_bar,
             )
         else:
-            range_iter = range(0, len(images), batch_size)
-        from PIL import Image
         truncate_dim = truncate_dim or self.config.truncate_dim
         for i in range_iter:
-            batch_images = images[i:i+batch_size]
-            processed_inputs = []
-            for img in batch_images:
-                if isinstance(img, str):
-                    if img.startswith('http'):
-                        response = requests.get(img)
-                        image = Image.open(BytesIO(response.content)).convert('RGB')
-                    elif img.startswith('data:image/'):
-                        image = decode_data_image(img).convert('RGB')
-                    else:
-                        image = Image.open(img).convert('RGB')
-                elif isinstance(img, Image.Image):
-                    image = img.convert('RGB')
-                else:
-                    raise ValueError("Unsupported image format")
-                processed_inputs.append(image)
-            processed_inputs = self.preprocess(processed_inputs)
-            processed_inputs = processed_inputs.to(self.device)
-            embeddings = self.get_image_features(processed_inputs)
             if truncate_dim:
                 embeddings = self.truncate_embeddings(embeddings, truncate_dim)
             if normalize_embeddings:
-                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
             if convert_to_numpy:
                 embeddings = embeddings.cpu()
             all_embeddings.extend(embeddings)
-        all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
         if convert_to_tensor:
             all_embeddings = torch.stack(all_embeddings)
         elif convert_to_numpy:
-            all_embeddings = np.asarray([emb.to(torch.float32).numpy() for emb in all_embeddings])
-        if input_was_single_img:
             all_embeddings = all_embeddings[0]
-        self.train(is_training)
         return all_embeddings
     def forward(
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         pixel_values: Union[None, torch.FloatTensor, BatchFeature] = None,
         return_dict: Optional[bool] = None,
         return_loss: Optional[bool] = None,
         *_,
@@ -566,8 +606,9 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         image_embeds = self.get_image_features(pixel_values=pixel_values)
-        text_embeds = self.get_text_features(input_ids=input_ids)
         # normalized features
         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

 import torch
 import torch.nn.functional as f
 import torch.utils.checkpoint
+from PIL import Image
 from torch import nn
 from transformers import (
     AutoImageProcessor,
     has_tqdm = True
 except ImportError:
+    trange = None
     has_tqdm = False
 from .configuration_clip import JinaCLIPConfig, JinaCLIPTextConfig, JinaCLIPVisionConfig
 from .eva_model import EVAVisionTransformer
 from .hf_model import HFTextEncoder
 from .rope_embeddings import VisionRotaryEmbeddingFast  # noqa: F401
 from .transform import (  # noqa: F401
     OPENAI_DATASET_MEAN,
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         return_dict: Optional[bool] = None,
+        use_lora: bool = False,
+        adapter_mask: Optional[torch.Tensor] = None,
+        task: Optional[str] = None,
         *_,
         **__,
     ) -> Union[Tuple[Optional[torch.FloatTensor], ...], CLIPTextModelOutput]:
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         x = input_ids.input_ids if isinstance(input_ids, BatchEncoding) else input_ids
+        feats = self.text_model(
+            x=x,
+            use_lora=use_lora,
+            adapter_mask=adapter_mask,
+            task=task,
+        )
         out = CLIPTextModelOutput(text_embeds=feats)
         return out if return_dict else out.to_tuple()
         vision_config = config.vision_config
         if config.use_text_flash_attn is not None:
+            text_config.hf_model_config_kwargs['use_flash_attn'] = (
+                config.use_text_flash_attn
+            )
         if config.use_vision_xformers is not None:
             vision_config.x_attention = config.use_vision_xformers
         self.projection_dim = config.projection_dim
         self.text_embed_dim = text_config.embed_dim
         self.vision_embed_dim = vision_config.embed_dim
         self.text_model = _build_text_tower(text_config)
         self.vision_model = _build_vision_tower(vision_config)
         self.logit_scale = nn.Parameter(
             torch.tensor(self.config.logit_scale_init_value)
         )
         if self.add_projections:
             self.visual_projection = nn.Linear(
                 self.vision_embed_dim, self.projection_dim, bias=False
     def get_text_features(
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         adapter_mask: Optional[torch.Tensor] = None,
         *_,
         **__,
     ) -> torch.FloatTensor:
         """
         Runs the text tower on the given token ids and projects the result
         Args:
             input_ids(`torch.Tensor` or `BatchEncoding`, *optional*):
                 Token ids. When a `BatchEncoding` is given, its `input_ids`
                 attribute is used
             adapter_mask(`torch.Tensor`, *optional*, defaults to None):
                 Forwarded unchanged to the text tower as `adapter_mask`
         Returns:
             The output of `text_projection` applied to the text tower output.
             Any additional positional or keyword arguments are ignored
         """
         if isinstance(input_ids, BatchEncoding):
             token_ids = input_ids.input_ids
         else:
             token_ids = input_ids
         text_features = self.text_model(x=token_ids, adapter_mask=adapter_mask)
         return self.text_projection(text_features)
     def get_image_features(
         self,
         )
         return self.visual_projection(self.vision_model(x=x))
+    def _truncate_embeddings(self, embeddings: torch.Tensor, truncate_dim: int):
         if not self.config.matryoshka_dimensions:
             logger.warning(
+                'Model is not trained using Matryoshka Representation Learning, '
+                'truncating embeddings will not work optimally.'
             )
+        return embeddings[:, :truncate_dim]
+    @staticmethod
+    def _decode_image_data(image_data_str: str) -> Image:
+        header, data = image_data_str.split(',', 1)
+        image_data = base64.b64decode(data)
+        return Image.open(BytesIO(image_data))
     @torch.inference_mode()
+    def encode_image(
         self,
+        images: Union[str, List[Union[str, 'Image.Image']]],
         batch_size: int = 32,
         show_progress_bar: Optional[bool] = None,
         convert_to_numpy: bool = True,
         device: Optional[torch.device] = None,
         normalize_embeddings: bool = True,
         truncate_dim: Optional[int] = None,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
+        Computes image embeddings
+        Args:
+            images(`str` or `List[Union[str, Image.Image]]`):
+                Image paths, URLs, PIL images, or data:image/ strings to be encoded
+            batch_size(`int`, *optional*, defaults to 32):
+                Batch size for the computation
+            show_progress_bar(`bool`, *optional*, defaults to None):
+                Show a progress bar when encoding images. If set to None, progress bar
+                is only shown when `logger.level == logging.INFO` or
+                `logger.level == logging.DEBUG`
+            convert_to_numpy(`bool`, *optional*, defaults to True):
+                If true, the output is a list of numpy vectors. Else, it is a list of
+                pytorch tensors
+            convert_to_tensor(`bool`, *optional*, defaults to False):
+                If true, you get one large tensor as return. Overwrites any setting
+                from convert_to_numpy
+            device(`torch.device`, *optional*, defaults to None):
+                Which torch.device to use for the computation
+            normalize_embeddings(`bool`, *optional*, defaults to False):
+                If set to true, returned vectors will have length 1. In that case,
+                the faster dot-product (util.dot_score) instead of cosine similarity
+                can be used
+            truncate_dim(`int`, *optional*, defaults to None):
+                The dimension to truncate sentence embeddings to. If set to `None`
+                no truncation is performed
+        Returns:
+            By default, a list of tensors is returned. If convert_to_tensor, a stacked
+            tensor is returned. If convert_to_numpy, a numpy matrix is returned
         """
+        _is_training = self.training
         self.eval()
+        self.preprocess = self.get_preprocess()
+        all_embeddings = []
         if show_progress_bar is None:
             show_progress_bar = (
                 logger.getEffectiveLevel() == logging.INFO
                 or logger.getEffectiveLevel() == logging.DEBUG
             )
         if convert_to_tensor:
             convert_to_numpy = False
+        _input_was_single_img = False
+        if isinstance(images, str) or not hasattr(images, '__len__'):
+            images = [images]
+            _input_was_single_img = True
         if device is not None:
             self.to(device)
+        _permutation = np.argsort([-len(str(i)) for i in images])
+        _inverse_permutation = np.argsort(_permutation)
+        images = [images[idx] for idx in _permutation]
         if has_tqdm:
             range_iter = trange(
                 0,
+                len(images),
                 batch_size,
                 desc='Encoding',
                 disable=not show_progress_bar,
             )
         else:
+            range_iter = range(0, len(images), batch_size)
         truncate_dim = truncate_dim or self.config.truncate_dim
         for i in range_iter:
+            _processed_images = []
+            for img in images[i: i + batch_size]:
+                if isinstance(img, str):
+                    if img.startswith('http'):
+                        response = requests.get(img)
+                        image = Image.open(BytesIO(response.content)).convert('RGB')
+                    elif img.startswith('data:image/'):
+                        image = self._decode_image_data(img).convert('RGB')
+                    else:
+                        image = Image.open(img).convert('RGB')
+                elif isinstance(img, Image.Image):
+                    image = img.convert('RGB')
+                else:
+                    raise ValueError('Unsupported image format')
+                _processed_images.append(image)
+            pixelvals = self.preprocess(_processed_images)
+            pixelvals = pixelvals.to(self.device)
+            embeddings = self.get_image_features(pixelvals)
             if truncate_dim:
                 embeddings = self.truncate_embeddings(embeddings, truncate_dim)
             if normalize_embeddings:
+                embeddings = f.normalize(embeddings, p=2, dim=1)
             if convert_to_numpy:
                 embeddings = embeddings.cpu()
             all_embeddings.extend(embeddings)
+        all_embeddings = [all_embeddings[idx] for idx in _inverse_permutation]
         if convert_to_tensor:
             all_embeddings = torch.stack(all_embeddings)
         elif convert_to_numpy:
+            all_embeddings = np.asarray(
+                [emb.to(torch.float32).numpy() for emb in all_embeddings]
+            )
+        if _input_was_single_img:
             all_embeddings = all_embeddings[0]
+        self.train(_is_training)
         return all_embeddings
     @torch.inference_mode()
+    def encode_text(
         self,
+        sentences: Union[str, List[str]],
+        task: Optional[str] = None,
         batch_size: int = 32,
         show_progress_bar: Optional[bool] = None,
         convert_to_numpy: bool = True,
         device: Optional[torch.device] = None,
         normalize_embeddings: bool = True,
         truncate_dim: Optional[int] = None,
+        **tokenizer_kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
+        Computes text embeddings
         Args:
+            sentences(`str` or `List[str]`):
+                Sentence or sentences to be encoded
+            task(`str`, *optional*, defaults to `None`):
+                Specifies the task for which the encoding is intended. If `task` is
+                not provided, all LoRA adapters are disabled, and the model reverts
+                to its original, general-purpose weights
             batch_size(`int`, *optional*, defaults to 32):
                 Batch size for the computation
             show_progress_bar(`bool`, *optional*, defaults to None):
+                Show a progress bar when encoding sentences. If set to None, progress
+                bar is only shown when `logger.level == logging.INFO` or
+                `logger.level == logging.DEBUG`
             convert_to_numpy(`bool`, *optional*, defaults to True):
+                If true, the output is a list of numpy vectors. Else, it is a list of
+                pytorch tensors
             convert_to_tensor(`bool`, *optional*, defaults to False):
+                If true, you get one large tensor as return. Overwrites any setting
+                from convert_to_numpy
             device(`torch.device`, *optional*, defaults to None):
                 Which torch.device to use for the computation
             normalize_embeddings(`bool`, *optional*, defaults to False):
                 If set to true, returned vectors will have length 1. In that case,
                 the faster dot-product (util.dot_score) instead of cosine similarity
+                can be used
             truncate_dim(`int`, *optional*, defaults to None):
+                The dimension to truncate sentence embeddings to. If set to `None`
+                no truncation is performed
+            tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
+                Keyword arguments for the tokenizer
         Returns:
+            By default, a list of tensors is returned. If convert_to_tensor, a stacked
+            tensor is returned. If convert_to_numpy, a numpy matrix is returned.
         """
+        _is_training = self.training
         self.eval()
         all_embeddings = []
+        self.tokenizer = self.get_tokenizer()
         if show_progress_bar is None:
             show_progress_bar = (
                 logger.getEffectiveLevel() == logging.INFO
                 or logger.getEffectiveLevel() == logging.DEBUG
             )
         if convert_to_tensor:
             convert_to_numpy = False
+        _input_was_string = False
+        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
+            sentences = [sentences]
+            _input_was_string = True
         if device is not None:
             self.to(device)
+        _permutation = np.argsort([-len(i) for i in sentences])
+        _inverse_permutation = np.argsort(_permutation)
+        sentences = [sentences[idx] for idx in _permutation]
+        tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
+        tokenizer_kwargs['max_length'] = tokenizer_kwargs.get('max_length', 512)
+        tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
         if has_tqdm:
             range_iter = trange(
                 0,
+                len(sentences),
                 batch_size,
                 desc='Encoding',
                 disable=not show_progress_bar,
             )
         else:
+            range_iter = range(0, len(sentences), batch_size)
         truncate_dim = truncate_dim or self.config.truncate_dim
+        adapter_mask = None
+        if task:
+            if not self.text_model._supports_lora:
+                logger.warning('Text tower does not support LoRA task adaptation')
+            elif task not in self.text_model._lora_adaptation_map:
+                raise ValueError(
+                    f'Unsupported task \'{task}\'. Choose one of the following: '
+                    f'{", ".join(self.text_model._lora_adaptation_map)} or bypass the '
+                    '`task` argument to disable LoRA completely.'
+                )
+            else:
+                taskid = self.text_model._lora_adaptation_map[task]
+                nexamples = 1 if isinstance(sentences, str) else len(sentences)
+                adapter_mask = torch.full(
+                    (nexamples,), taskid, dtype=torch.int32, device=self.device
+                )
+            if not self.text_model._supports_task_instructions:
+                logger.warning('Text tower does not support task instructions')
+            elif task not in self.text_model._task_instructions:
+                raise ValueError(
+                    f'Unsupported task \'{task}\'. Choose one of the following: '
+                    f'{", ".join(self.text_model._task_instructions)} or bypass the '
+                    '`task` argument to disable task instructions completely.'
+                )
+            else:
+                instruction = self.text_model._task_instructions[task]
+                sentences = [instruction + sentence for sentence in sentences]
         for i in range_iter:
+            tokens = self.tokenizer(
+                sentences[i: i + batch_size],
+                return_tensors='pt',
+                **tokenizer_kwargs,
+            ).to(self.device)
+            embeddings = self.get_text_features(
+                input_ids=tokens, adapter_mask=adapter_mask
+            )
             if truncate_dim:
                 embeddings = self.truncate_embeddings(embeddings, truncate_dim)
             if normalize_embeddings:
+                embeddings = f.normalize(embeddings, p=2, dim=1)
             if convert_to_numpy:
                 embeddings = embeddings.cpu()
             all_embeddings.extend(embeddings)
+        all_embeddings = [all_embeddings[idx] for idx in _inverse_permutation]
         if convert_to_tensor:
             all_embeddings = torch.stack(all_embeddings)
         elif convert_to_numpy:
+            all_embeddings = np.asarray(
+                [emb.to(torch.float32).numpy() for emb in all_embeddings]
+            )
+        if _input_was_string:
             all_embeddings = all_embeddings[0]
+        self.train(_is_training)
         return all_embeddings
     def forward(
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         pixel_values: Union[None, torch.FloatTensor, BatchFeature] = None,
+        adapter_mask: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
         return_loss: Optional[bool] = None,
         *_,
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         image_embeds = self.get_image_features(pixel_values=pixel_values)
+        text_embeds = self.get_text_features(
+            input_ids=input_ids, adapter_mask=adapter_mask
+        )
         # normalized features
         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

processing_clip.py CHANGED Viewed

@@ -72,7 +72,6 @@ class JinaCLIPImageProcessor(BaseImageProcessor):
         return output
     def preprocess(self, images: ImageInput, **kwargs) -> BatchFeature:
         _transform_needs_rebuild = False
         for k, v in kwargs.items():
             if k in self._valid_processor_keys:

         return output
     def preprocess(self, images: ImageInput, **kwargs) -> BatchFeature:
         _transform_needs_rebuild = False
         for k, v in kwargs.items():
             if k in self._valid_processor_keys:

transform.py CHANGED Viewed

@@ -1,11 +1,10 @@
-import numbers
 import random
 import warnings
 from dataclasses import asdict, dataclass
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 import torch
-import torchvision.transforms.functional as F
 from torchvision.transforms import (
     CenterCrop,
     ColorJitter,
@@ -23,88 +22,93 @@ OPENAI_DATASET_MEAN = tuple(OPENAI_CLIP_MEAN)
 OPENAI_DATASET_STD = tuple(OPENAI_CLIP_STD)
-@dataclass
-class PreprocessCfg:
-    size: Union[int, Tuple[int, int]] = 224
-    mode: str = 'RGB'
-    mean: Tuple[float, ...] = OPENAI_DATASET_MEAN
-    std: Tuple[float, ...] = OPENAI_DATASET_STD
-    interpolation: str = 'bicubic'
-    resize_mode: str = 'shortest'
-    fill_color: int = 0
-    def __post_init__(self):
-        assert self.mode in ('RGB',)
-    @property
-    def num_channels(self):
-        return 3
-    @property
-    def input_size(self):
-        return (self.num_channels,) + (self.size, self.size)
-_PREPROCESS_KEYS = set(asdict(PreprocessCfg()).keys())
-def merge_preprocess_dict(
-    base: Union[PreprocessCfg, Dict],
-    overlay: Dict,
-):
-    """Merge overlay key-value pairs on top of base preprocess cfg or dict.
-    Input dicts are filtered based on PreprocessCfg fields.
     """
-    if isinstance(base, PreprocessCfg):
-        base_clean = asdict(base)
-    else:
-        base_clean = {k: v for k, v in base.items() if k in _PREPROCESS_KEYS}
-    if overlay:
-        overlay_clean = {
-            k: v for k, v in overlay.items() if k in _PREPROCESS_KEYS and v is not None
-        }
-        base_clean.update(overlay_clean)
-    return base_clean
-def merge_preprocess_kwargs(base: Union[PreprocessCfg, Dict], **kwargs):
-    return merge_preprocess_dict(base, kwargs)
-@dataclass
-class AugmentationCfg:
-    scale: Tuple[float, float] = (0.9, 1.0)
-    ratio: Optional[Tuple[float, float]] = None
-    color_jitter: Optional[
-        Union[float, Tuple[float, float, float], Tuple[float, float, float, float]]
-    ] = None
-    re_prob: Optional[float] = None
-    re_count: Optional[int] = None
-    use_timm: bool = False
-    # params for simclr_jitter_gray
-    color_jitter_prob: float = None
-    gray_scale_prob: float = None
-def _setup_size(size, error_msg):
-    if isinstance(size, numbers.Number):
-        return int(size), int(size)
-    if isinstance(size, Sequence) and len(size) == 1:
-        return size[0], size[0]
-    if len(size) != 2:
-        raise ValueError(error_msg)
-    return size
-class ResizeKeepRatio:
-    """Resize and Keep Ratio
-    Copy & paste from `timm`
-    """
     def __init__(
         self,
@@ -159,8 +163,9 @@ class ResizeKeepRatio:
                 ratio_factor[0] / aspect_factor,
                 ratio_factor[1] * aspect_factor,
             )
-        size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
-        return size
     def __call__(self, img):
         """
@@ -180,7 +185,7 @@ class ResizeKeepRatio:
             self.random_aspect_prob,
             self.random_aspect_range,
         )
-        img = F.resize(img, size, self.interpolation)
         return img
     def __repr__(self):
@@ -190,92 +195,8 @@ class ResizeKeepRatio:
         return format_string
-def center_crop_or_pad(
-    img: torch.Tensor, output_size: List[int], fill=0
-) -> torch.Tensor:
-    """Center crops and/or pads the given image.
-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading
-    dimensions. If image size is smaller than output size along any edge, image is
-    padded with 0 and then center cropped.
-    Args:
-        img (PIL Image or Tensor): Image to be cropped.
-        output_size (sequence or int): (height, width) of the crop box. If int or
-        sequence with single int, it is used for both directions.
-        fill (int, Tuple[int]): Padding color
-    Returns:
-        PIL Image or Tensor: Cropped image.
-    """
-    if isinstance(output_size, numbers.Number):
-        output_size = (int(output_size), int(output_size))
-    elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
-        output_size = (output_size[0], output_size[0])
-    _, image_height, image_width = F.get_dimensions(img)
-    crop_height, crop_width = output_size
-    if crop_width > image_width or crop_height > image_height:
-        padding_ltrb = [
-            (crop_width - image_width) // 2 if crop_width > image_width else 0,
-            (crop_height - image_height) // 2 if crop_height > image_height else 0,
-            (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
-            (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
-        ]
-        img = F.pad(img, padding_ltrb, fill=fill)
-        _, image_height, image_width = F.get_dimensions(img)
-        if crop_width == image_width and crop_height == image_height:
-            return img
-    crop_top = int(round((image_height - crop_height) / 2.0))
-    crop_left = int(round((image_width - crop_width) / 2.0))
-    return F.crop(img, crop_top, crop_left, crop_height, crop_width)
-class CenterCropOrPad(torch.nn.Module):
-    """Crops the given image at the center.
-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading
-    dimensions. If image size is smaller than output size along any edge, image is
-    padded with 0 and then center cropped.
-    Args:
-        size (sequence or int): Desired output size of the crop. If size is an
-            int instead of sequence like (h, w), a square crop (size, size) is
-            made. If provided a sequence of length 1, it will be interpreted as
-            (size[0], size[0]).
-    """
-    def __init__(self, size, fill=0):
-        super().__init__()
-        self.size = _setup_size(
-            size, error_msg='Please provide only two dimensions (h, w) for size.'
-        )
-        self.fill = fill
-    def forward(self, img):
-        """
-        Args:
-            img (PIL Image or Tensor): Image to be cropped.
-        Returns:
-            PIL Image or Tensor: Cropped image.
-        """
-        return center_crop_or_pad(img, self.size, fill=self.fill)
-    def __repr__(self) -> str:
-        return f'{self.__class__.__name__}(size={self.size})'
-def _convert_to_rgb(image):
-    return image.convert('RGB')
 class _ColorJitter(object):
-    """
-    Apply Color Jitter to the PIL image with a specified probability.
-    """
     def __init__(self, brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0, p=0.8):
         assert 0.0 <= p <= 1.0
@@ -292,9 +213,7 @@ class _ColorJitter(object):
 class _GrayScale(object):
-    """
-    Apply Gray Scale to the PIL image with a specified probability.
-    """
     def __init__(self, p=0.2):
         assert 0.0 <= p <= 1.0
@@ -308,6 +227,20 @@ class _GrayScale(object):
             return img
 def image_transform(
     image_size: Union[int, Tuple[int, int]],
     is_train: bool,
@@ -407,10 +340,10 @@ def image_transform(
     else:
         if resize_mode == 'longest':
             transforms = [
-                ResizeKeepRatio(
                     image_size, interpolation=interpolation_mode, longest=1
                 ),
-                CenterCropOrPad(image_size, fill=fill_color),
             ]
         elif resize_mode == 'squash':
             if isinstance(image_size, int):
@@ -428,7 +361,7 @@ def image_transform(
                 transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
             else:
                 # resize shortest edge to matching target dim for non-square target
-                transforms = [ResizeKeepRatio(image_size)]
             transforms += [CenterCrop(image_size)]
         transforms.extend(
@@ -439,20 +372,3 @@ def image_transform(
             ]
         )
         return Compose(transforms)
-def image_transform_v2(
-    cfg: PreprocessCfg,
-    is_train: bool,
-    aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
-):
-    return image_transform(
-        image_size=cfg.size,
-        is_train=is_train,
-        mean=cfg.mean,
-        std=cfg.std,
-        interpolation=cfg.interpolation,
-        resize_mode=cfg.resize_mode,
-        fill_color=cfg.fill_color,
-        aug_cfg=aug_cfg,
-    )

 import random
 import warnings
 from dataclasses import asdict, dataclass
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 import torch
+import torchvision.transforms.functional as f
 from torchvision.transforms import (
     CenterCrop,
     ColorJitter,
 OPENAI_DATASET_STD = tuple(OPENAI_CLIP_STD)
+def _setup_size(size, error_msg):
+    if isinstance(size, int):
+        return size, size
+    if isinstance(size, Sequence) and len(size) == 1:
+        return size[0], size[0]
+    if len(size) != 2:
+        raise ValueError(error_msg)
+    return size
def _center_crop_or_pad(
    img: torch.Tensor,
    output_size: Union[int, Tuple[int, ...], List[int]],
    fill: Union[int, Tuple[int]] = 0,
) -> torch.Tensor:
    """Center-crop ``img`` to ``output_size``, padding first where it is too small.

    Args:
        img: Image whose dimensions are read via ``f.get_dimensions``.
        output_size: ``(height, width)`` of the result. An ``int`` or a
            one-element tuple/list is used for both dimensions.
        fill: Value forwarded to ``f.pad`` for the padded area.

    Returns:
        The padded and/or cropped image.
    """
    if isinstance(output_size, int):
        output_size = (output_size, output_size)
    elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
        output_size = (output_size[0], output_size[0])

    _, current_h, current_w = f.get_dimensions(img)
    target_h, target_w = output_size

    too_narrow = target_w > current_w
    too_short = target_h > current_h
    if too_narrow or too_short:
        # Split the missing pixels between both sides; the extra pixel of an
        # odd difference goes to the right / bottom edge.
        pad_left = (target_w - current_w) // 2 if too_narrow else 0
        pad_top = (target_h - current_h) // 2 if too_short else 0
        pad_right = (target_w - current_w + 1) // 2 if too_narrow else 0
        pad_bottom = (target_h - current_h + 1) // 2 if too_short else 0
        img = f.pad(img, [pad_left, pad_top, pad_right, pad_bottom], fill=fill)

        _, current_h, current_w = f.get_dimensions(img)
        if target_w == current_w and target_h == current_h:
            # Padding alone reached the requested size; nothing left to crop.
            return img

    top = int(round((current_h - target_h) / 2.0))
    left = int(round((current_w - target_w) / 2.0))
    return f.crop(img, top, left, target_h, target_w)
+class _CenterCropOrPad(torch.nn.Module):
+    """Crops the given image at the center.
+    If the image is torch Tensor, it is expected
+    to have [..., H, W] shape, where ... means an arbitrary number of leading
+    dimensions. If image size is smaller than output size along any edge, image is
+    padded with 0 and then center cropped.
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made. If provided a sequence of length 1, it will be interpreted as
+            (size[0], size[0]).
+    """
+    def __init__(self, size, fill=0):
+        super().__init__()
+        self.size = _setup_size(
+            size, error_msg='Please provide only two dimensions (h, w) for size.'
+        )
+        self.fill = fill
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be cropped.
+        Returns:
+            PIL Image or Tensor: Cropped image.
+        """
+        return _center_crop_or_pad(img, self.size, fill=self.fill)
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(size={self.size})'
+def _convert_to_rgb(image):
+    return image.convert('RGB')
+class _ResizeKeepRatio:
+    """Resize while keeping ratio. Copied from timm"""
     def __init__(
         self,
                 ratio_factor[0] / aspect_factor,
                 ratio_factor[1] * aspect_factor,
             )
+        return [
+            round(x * factor / ratio) for x, factor in zip(source_size, ratio_factor)
+        ]
     def __call__(self, img):
         """
             self.random_aspect_prob,
             self.random_aspect_range,
         )
+        img = f.resize(img, size, self.interpolation)
         return img
     def __repr__(self):
         return format_string
 class _ColorJitter(object):
+    """Apply color jitter to the PIL image with a specified probability"""
     def __init__(self, brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0, p=0.8):
         assert 0.0 <= p <= 1.0
 class _GrayScale(object):
+    """Apply gray scale to the PIL image with a specified probability"""
     def __init__(self, p=0.2):
         assert 0.0 <= p <= 1.0
             return img
@dataclass
class AugmentationCfg:
    """Container for training-time image augmentation settings.

    Every field except ``scale`` and ``use_timm`` defaults to ``None``.
    NOTE(review): how each field is consumed is not visible in this chunk;
    confirm against ``image_transform`` before relying on the notes below.
    """

    # (min, max) pair; presumably the random-resized-crop scale range.
    scale: Tuple[float, float] = (0.9, 1.0)
    # Optional (min, max) pair; presumably the crop aspect-ratio range.
    ratio: Optional[Tuple[float, float]] = None
    # A single strength, or a 3-/4-tuple of strengths.
    color_jitter: Optional[
        Union[float, Tuple[float, float, float], Tuple[float, float, float, float]]
    ] = None
    # Presumably random-erasing probability and count.
    re_prob: Optional[float] = None
    re_count: Optional[int] = None
    use_timm: bool = False
    # Annotated Optional because both default to None.
    color_jitter_prob: Optional[float] = None
    gray_scale_prob: Optional[float] = None
 def image_transform(
     image_size: Union[int, Tuple[int, int]],
     is_train: bool,
     else:
         if resize_mode == 'longest':
             transforms = [
+                _ResizeKeepRatio(
                     image_size, interpolation=interpolation_mode, longest=1
                 ),
+                _CenterCropOrPad(image_size, fill=fill_color),
             ]
         elif resize_mode == 'squash':
             if isinstance(image_size, int):
                 transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
             else:
                 # resize shortest edge to matching target dim for non-square target
+                transforms = [_ResizeKeepRatio(image_size)]
             transforms += [CenterCrop(image_size)]
         transforms.extend(
             ]
         )
         return Compose(transforms)