Add task_prefix_attention_mask argument to _merge_input_ids_with_image_features for better padding handling
This PR introduces a small change to the `_merge_input_ids_with_image_features` function by adding a `task_prefix_attention_mask=None` argument. With it, batches padded to the longest prompt get an attention mask that correctly ignores the padding tokens.
Changes Made:
1. Added a `task_prefix_attention_mask=None` argument to the `_merge_input_ids_with_image_features` function.
2. Updated the function to incorporate the provided attention mask, so padding tokens are ignored during batch processing (see the sketch after this list).
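For context, here is a minimal sketch of the mask construction this change enables, distilled from the diff below. The helper name is illustrative, and the real method also merges the embeddings themselves, which is omitted here:

```python
import torch

def merged_attention_mask_sketch(image_features, task_prefix_embeds, task_prefix_attention_mask=None):
    # Sketch: only the mask construction, not the full embedding merge.
    batch_size = image_features.size(0)
    device = image_features.device

    # Image tokens always attend; they are never padding.
    image_attention_mask = torch.ones(batch_size, image_features.size(1), device=device)

    # Fallback preserves the old behaviour: assume no padding in the text prefix.
    if task_prefix_attention_mask is None:
        task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)

    # If a 3-D mask is passed, keep only the per-token row.
    if len(task_prefix_attention_mask.shape) == 3:
        task_prefix_attention_mask = task_prefix_attention_mask[:, 0]

    # Lines up with torch.cat([image_features, task_prefix_embeds], dim=1).
    return torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
```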
Below is an example demonstrating the issue and the improvement:
```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Checkpoint name is illustrative; any Florence-2 checkpoint that ships this
# modeling file behaves the same way.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large", torch_dtype=torch.float16, trust_remote_code=True
).to("cuda")
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

prompts = ["prompt", "longer prompt", "much much longer prompt"]
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)
images = [image] * len(prompts)
inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True).to("cuda", torch.float16)

inputs_embeds = model.get_input_embeddings()(inputs.input_ids)
image_features = model._encode_image(inputs.pixel_values)
print(inputs.input_ids)
# Output:
# tensor([[    0, 12501,  3320,     2,     1,     1],
#         [    0,  3479,   254, 14302,     2,     1],
#         [    0, 28431,   203,  1181, 14302,     2]], device='cuda:0')

# Before change: every position attends, including padding
merged_embeds, attention_mask = model._merge_input_ids_with_image_features(image_features, inputs_embeds)
print(attention_mask[:, -10:])
# Output:
# tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')

# After change: padding positions of the shorter prompts are masked out
merged_embeds, attention_mask = model._merge_input_ids_with_image_features(
    image_features, inputs_embeds, task_prefix_attention_mask=inputs.attention_mask
)
print(attention_mask[:, -10:])
# Output:
# tensor([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')
```
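The trailing zeros line up with the processor's own padding. Given the padded `input_ids` above (pad token id 1), the expected mask from the processor is:

```python
print(inputs.attention_mask)
# Expected, given the tokenization shown above:
# tensor([[1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1, 1]], device='cuda:0')
```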
- `modeling_florence2.py` (+13 −8):

```diff
@@ -2643,7 +2643,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         return x
 
     def _merge_input_ids_with_image_features(
-        self, image_features, inputs_embeds
+        self, image_features, inputs_embeds, task_prefix_attention_mask=None
     ):
         batch_size, image_token_length = image_features.size()[:-1]
         device = image_features.device
@@ -2655,10 +2655,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
             return image_features, image_attention_mask
 
         task_prefix_embeds = inputs_embeds
-
+
+        if task_prefix_attention_mask is None:
+            task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
 
-
-
+        if len(task_prefix_attention_mask.shape) == 3:
+            task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
 
         # concat [image embeds, task prefix embeds]
         inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
@@ -2719,6 +2721,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "A green car parked in front of a yellow building."
         ```"""
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -2734,8 +2737,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         if pixel_values is not None:
             # (batch_size, num_image_tokens, hidden_size)
             image_features = self._encode_image(pixel_values)
-            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
-
+            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds, task_prefix_attention_mask=attention_mask)
+
         if inputs_embeds is not None:
             attention_mask = attention_mask.to(inputs_embeds.dtype)
         outputs = self.language_model(
@@ -2781,6 +2784,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         input_ids,
         inputs_embeds=None,
         pixel_values=None,
+        attention_mask=None,
         **kwargs
     ):
 
@@ -2791,11 +2795,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         # 2. Merge text and images
         if pixel_values is not None:
             image_features = self._encode_image(pixel_values)
-            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds, task_prefix_attention_mask=attention_mask)
 
         return self.language_model.generate(
             input_ids=None,
             inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
             **kwargs
         )
 
@@ -2844,4 +2849,4 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         return self.language_model.shift_tokens_right(labels)
 
     def _reorder_cache(self, *args, **kwargs):
-        return self.language_model._reorder_cache(*args, **kwargs)
+        return self.language_model._reorder_cache(*args, **kwargs)
```
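With the new `attention_mask` argument wired through `generate`, callers only need to forward the processor's mask. A sketch of the intended end-to-end usage, reusing the setup above (`max_new_tokens` and `num_beams` are illustrative generation settings, not part of this PR):

```python
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    attention_mask=inputs["attention_mask"],  # now forwarded to the merge step
    max_new_tokens=256,
    num_beams=3,
)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```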