gmastrapas committed
Commit d779277 • Parent(s): a4480ad

feat: remove adapter_mask from interface

- hf_model.py +76 -23
- modeling_clip.py +10 -39
hf_model.py
CHANGED
@@ -1,6 +1,6 @@
 import re
 import warnings
-from typing import Dict, Optional
+from typing import Dict, Optional, Union

 import torch
 import torch.nn as nn
@@ -208,21 +208,48 @@ class HFTextEncoder(nn.Module):
             self._task_instructions = self.transformer._task_instructions
             self._supports_task_instructions = True

-        self.
-        self.
-        self.
-        self.
+        self._default_instruction_task = None
+        self._default_lora_task = None
+        self._default_instruction = None
+        self._default_loraid = None
+
         if default_instruction_task is not None:
-            self.
-            self.
+            self._default_instruction_task = default_instruction_task
+            self._default_instruction = self.get_instruction_from_task(
                 default_instruction_task
             )
         if default_lora_task is not None:
-            self.
-            self.
+            self._default_lora_task = default_lora_task
+            self._default_loraid = self.get_loraid_from_task(default_lora_task)
+
+    @property
+    def supports_task_instructions(self) -> bool:
+        return self._supports_task_instructions
+
+    @property
+    def supports_lora(self) -> bool:
+        return self._supports_lora
+
+    @property
+    def task_instructions(self) -> Dict[str, str]:
+        return self._task_instructions
+
+    @property
+    def lora_adaptation_map(self) -> Dict[str, int]:
+        return self._lora_adaptation_map

-
+    @property
+    def default_instruction(self) -> Optional[str]:
+        return self._default_instruction
+
+    @property
+    def default_loraid(self) -> Optional[int]:
+        return self._default_loraid
+
+    def get_instruction_from_task(self, task: Optional[str]) -> Optional[str]:
         if self._supports_task_instructions:
+            if task is None:
+                return self._default_instruction
             if task not in self._task_instructions:
                 raise ValueError(
                     f'Unsupported task \'{task}\'. Choose one of the following: '
@@ -231,14 +258,17 @@ class HFTextEncoder(nn.Module):
                 )
             return self._task_instructions[task]
         else:
-
-
-
-
+            if task is not None:
+                warnings.warn(
+                    'Model does not support task instructions, ignoring instruction '
+                    f"task '{task}'"
+                )
             return None

-    def get_loraid_from_task(self, task: str) -> Optional[int]:
+    def get_loraid_from_task(self, task: Optional[str]) -> Optional[int]:
         if self._supports_lora:
+            if task is None:
+                return self._default_loraid
             if task not in self._lora_adaptation_map:
                 raise ValueError(
                     f'Unsupported task \'{task}\'. Choose one of the following: '
@@ -247,11 +277,18 @@ class HFTextEncoder(nn.Module):
                 )
             return self._lora_adaptation_map[task]
         else:
-
-
-
+            if task is not None:
+                warnings.warn(
+                    f"Model does not support LoRA adapters, ignoring LoRA task '{task}'"
+                )
             return None

+    @staticmethod
+    def get_adapter_mask_from_loraid(
+        batch_size: int, loraid: int, device: Union[str, torch.device]
+    ):
+        return torch.full((batch_size,), loraid, dtype=torch.int32, device=device)
+
     @torch.jit.ignore
     def set_grad_checkpointing(self, _=True):
         self.transformer.gradient_checkpointing_enable()
@@ -260,12 +297,28 @@ class HFTextEncoder(nn.Module):
         pass

     def forward(self, x: torch.Tensor, adapter_mask: Optional[torch.Tensor] = None):
-
-
+        if adapter_mask is None:
+            default_loraid = self.default_loraid
+            if default_loraid is not None:
+                adapter_mask = self.get_adapter_mask_from_loraid(
+                    x.shape[0], default_loraid, x.device
+                )
+        else:
+            if not self.supports_lora:
+                warnings.warn(
+                    'Model does not support LoRA adapters, setting adapter_mask to None'
+                )
+                adapter_mask = None
+
+        attention_mask = (x != self.config.pad_token_id).long()
+        lora_kwargs = {}
         if adapter_mask is not None:
-
-
-
+            lora_kwargs['adapter_mask'] = adapter_mask
+
+        out = self.transformer(
+            input_ids=x, attention_mask=attention_mask, **lora_kwargs
+        )
+        pooled_out = self.pooler(out, attention_mask)
         projected = self.proj(pooled_out)
         seqlen = out.last_hidden_state.shape[1]
         tokens = (
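Taken together, the hf_model.py changes move adapter-mask construction out of the caller's hands: forward() now derives the mask from the default LoRA task whenever none is supplied. A minimal sketch of that fallback in isolation, using a made-up token batch and LoRA id instead of a real HFTextEncoder (both hypothetical):

import torch

# Hypothetical stand-ins: a token batch and the id that would come from
# HFTextEncoder._default_loraid after construction with default_lora_task.
x = torch.zeros((4, 16), dtype=torch.long)
default_loraid = 2

adapter_mask = None
if adapter_mask is None and default_loraid is not None:
    # Same construction as HFTextEncoder.get_adapter_mask_from_loraid:
    # one adapter id per example in the batch.
    adapter_mask = torch.full(
        (x.shape[0],), default_loraid, dtype=torch.int32, device=x.device
    )

print(adapter_mask)  # tensor([2, 2, 2, 2], dtype=torch.int32)

The mask is only forwarded to the transformer as adapter_mask when it is not None, so models without LoRA support keep the plain input_ids/attention_mask call.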
modeling_clip.py
CHANGED
@@ -159,9 +159,6 @@ class JinaCLIPTextModel(JinaCLIPPreTrainedModel):
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         return_dict: Optional[bool] = None,
-        use_lora: bool = False,
-        adapter_mask: Optional[torch.Tensor] = None,
-        task: Optional[str] = None,
         *_,
         **__,
     ) -> Union[Tuple[Optional[torch.FloatTensor], ...], CLIPTextModelOutput]:
@@ -169,12 +166,7 @@ class JinaCLIPTextModel(JinaCLIPPreTrainedModel):
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         x = input_ids.input_ids if isinstance(input_ids, BatchEncoding) else input_ids
-        feats = self.text_model(
-            x=x,
-            use_lora=use_lora,
-            adapter_mask=adapter_mask,
-            task=task,
-        )
+        feats = self.text_model(x=x)
         out = CLIPTextModelOutput(text_embeds=feats)
         return out if return_dict else out.to_tuple()

@@ -277,12 +269,11 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
     def get_text_features(
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
-        adapter_mask: Optional[torch.Tensor] = None,
         *_,
         **__,
     ) -> torch.FloatTensor:
         x = input_ids.input_ids if isinstance(input_ids, BatchEncoding) else input_ids
-        return self.text_projection(self.text_model(x=x
+        return self.text_projection(self.text_model(x=x))

     def get_image_features(
         self,
@@ -461,9 +452,9 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
             sentences(`str` or `List[str]`):
                 Sentence or sentences to be encoded
             task(`str`, *optional*, defaults to `None`):
-                Specifies the task for which the encoding is intended. If `task` is
-
-
+                Specifies the task for which the encoding is intended. If a `task` is
+                provided, a task-specific instruction is added to the beginning of each
+                sentence. If `task` is not provided, no instructions are added.
             batch_size(`int`, *optional*, defaults to 32):
                 Batch size for the computation
             show_progress_bar(`bool`, *optional*, defaults to None):
@@ -534,35 +525,17 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):

         truncate_dim = truncate_dim or self.config.truncate_dim

-        instruction = self.text_model.
-
-        if task:
-            _selected_instruction = self.text_model.get_instruction_from_task(task)
-            if _selected_instruction is not None:
-                instruction = _selected_instruction
-            _selected_loraid = self.text_model.get_loraid_from_task(task)
-            if _selected_loraid is not None:
-                loraid = _selected_loraid
-
-        if instruction is not None:
+        instruction = self.text_model.get_instruction_from_task(task)
+        if instruction:
             sentences = [instruction + sentence for sentence in sentences]

-        adapter_mask = None
-        if loraid is not None:
-            nexamples = 1 if isinstance(sentences, str) else len(sentences)
-            adapter_mask = torch.full(
-                (nexamples,), loraid, dtype=torch.int32, device=self.device
-            )
-
         for i in range_iter:
             tokens = self.tokenizer(
                 sentences[i: i + batch_size],
                 return_tensors='pt',
                 **tokenizer_kwargs,
             ).to(self.device)
-            embeddings = self.get_text_features(
-                input_ids=tokens, adapter_mask=adapter_mask
-            )
+            embeddings = self.get_text_features(input_ids=tokens)
             if truncate_dim:
                 embeddings = self.truncate_embeddings(embeddings, truncate_dim)
             if normalize_embeddings:
@@ -589,7 +562,6 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
         pixel_values: Union[None, torch.FloatTensor, BatchFeature] = None,
-        adapter_mask: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
         return_loss: Optional[bool] = None,
         *_,
@@ -599,9 +571,8 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         image_embeds = self.get_image_features(pixel_values=pixel_values)
-        text_embeds = self.get_text_features(
-
-        )
+        text_embeds = self.get_text_features(input_ids=input_ids)
+
         # normalized features
         image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
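On the modeling_clip.py side, the public interface now only exposes `task`; `adapter_mask` and `use_lora` are gone and LoRA selection happens inside the text tower. A rough usage sketch under that assumption — the repository id below is a placeholder for illustration, not something named in this diff:

import torch
from transformers import AutoModel, AutoTokenizer

repo = 'org/jina-clip-checkpoint'  # placeholder repo id, not from this commit
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

# Before this commit, callers could pass adapter_mask / use_lora here;
# now get_text_features only needs the tokenized input.
tokens = tokenizer(['a photo of a cat'], return_tensors='pt')
with torch.no_grad():
    text_embeds = model.get_text_features(input_ids=tokens)

The higher-level encode path shown in the last hunks behaves the same way: it resolves the instruction via get_instruction_from_task(task), prefixes it to the sentences, and calls get_text_features without building an adapter mask.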