Lukas Pfahler
committed on
Commit 8241db4
1 Parent(s): 426fbbd
include modeling files in repo
Browse files
- __init__.py +0 -0
- configuration_kosmos2_5.py +330 -0
- image_processing_kosmos2_5.py +341 -0
- modeling_kosmos2_5.py +0 -0
- processing_kosmos2_5.py +145 -0
__init__.py
ADDED
File without changes
configuration_kosmos2_5.py
ADDED
@@ -0,0 +1,330 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""KOSMOS-2.5 model configuration"""

import os
from typing import Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Kosmos2_5TextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Kosmos2_5TextModel`]. It is used to instantiate
    a KOSMOS-2.5 text decoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2.5
    [microsoft/KOSMOS-2.5](https://huggingface.co/microsoft/KOSMOS-2.5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 108481):
            Vocabulary size of the Kosmos2_5 model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Kosmos2_5Model`].
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        embed_dim (`int`, *optional*, defaults to 1536):
            Dimensionality of the layers and the pooler layer.
        layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer decoder.
        ffn_dim (`int`, *optional*, defaults to 6144):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
        attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_embedding (`bool`, *optional*, defaults to `True`):
            Scale embeddings by dividing by sqrt(embed_dim).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    """

    model_type = "kosmos_2_5_text_model"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_attention_heads": "attention_heads",
        "hidden_size": "embed_dim",
        "num_hidden_layers": "layers",
    }

    def __init__(
        self,
        vocab_size=108481,
        max_position_embeddings=4096,
        embed_dim=1536,
        layers=24,
        ffn_dim=6144,
        attention_heads=16,
        activation_function="gelu",
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        layerdrop=0.0,
        layer_norm_eps=1e-5,
        init_std=0.02,
        scale_embedding=True,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.embed_dim = embed_dim
        self.layers = layers
        self.ffn_dim = ffn_dim
        self.attention_heads = attention_heads
        self.activation_function = activation_function
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.layerdrop = layerdrop
        self.layer_norm_eps = layer_norm_eps
        self.init_std = init_std
        self.scale_embedding = scale_embedding
        self.use_cache = use_cache

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the text config dict if we are loading from Kosmos2_5Config
        if config_dict.get("model_type") == "kosmos-2.5":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class Kosmos2_5VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Kosmos2_5VisionModel`]. It is used to
    instantiate a Kosmos2_5 vision model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the kosmos-2.5
    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the encoder layers and the pooler layer.
        patch_embed_hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the input patch_embedding layer in the Transformer encoder.
        d_ff (`int`, *optional*, defaults to 3968):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        d_kv (`int`, *optional*, defaults to 64):
            Dimensionality of the key, query, value projections per attention head.
        num_hidden_layers (`int`, *optional*, defaults to 18):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 24):
            Number of attention heads for each attention layer in the Transformer encoder.
        dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        seq_len (`int`, *optional*, defaults to 4096):
            Maximum sequence length (here number of patches) supported by the model.
    Example:

    ```python
    >>> from transformers import Kosmos2_5VisionConfig, Kosmos2_5VisionModel

    >>> # Initializing a Kosmos2_5VisionConfig with microsoft/kosmos-2.5 style configuration
    >>> configuration = Kosmos2_5VisionConfig()

    >>> # Initializing a Kosmos2_5VisionModel (with random weights) from the microsoft/kosmos-2.5 style configuration
    >>> model = Kosmos2_5VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "kosmos_2_5_vision_model"

    def __init__(
        self,
        hidden_size=1536,
        patch_embed_hidden_size=768,
        d_ff=3968,
        d_kv=64,
        num_hidden_layers=18,
        num_attention_heads=24,
        dense_act_fn="gelu_new",
        layer_norm_eps=1e-6,
        dropout_rate=0.0,
        attention_dropout=0.0,
        initializer_range=1e-10,
        initializer_factor=1.0,
        seq_len=4096,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.patch_embed_hidden_size = patch_embed_hidden_size
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.dense_act_fn = dense_act_fn
        self.seq_len = seq_len
        self.d_kv = d_kv

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from Kosmos2_5Config
        if config_dict.get("model_type") == "kosmos-2.5":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class Kosmos2_5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Kosmos2_5Model`]. It is used to instantiate a
    KOSMOS-2.5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the KOSMOS-2.5
    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Kosmos2_5TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Kosmos2_5VisionConfig`].
        latent_query_num (`int`, *optional*, defaults to 2048):
            The number of latent query tokens that represent the image features used in the text decoder component.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Kosmos2_5Config, Kosmos2_5Model

    >>> # Initializing a Kosmos2_5Config with the microsoft/kosmos-2.5 style configuration
    >>> configuration = Kosmos2_5Config()

    >>> # Initializing a model (with random weights) from the microsoft/kosmos-2.5 style configuration
    >>> model = Kosmos2_5Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "kosmos-2.5"
    is_composition = True

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        latent_query_num=2048,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the Kosmos2_5TextConfig with default values.")
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Initializing the Kosmos2_5VisionConfig with default values.")

        self.text_config = Kosmos2_5TextConfig(**text_config)
        self.vision_config = Kosmos2_5VisionConfig(**vision_config)

        self.latent_query_num = latent_query_num

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: Kosmos2_5TextConfig,
        vision_config: Kosmos2_5VisionConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`Kosmos2_5Config`] (or a derived class) from a KOSMOS-2.5 text model configuration and a
        KOSMOS-2.5 vision model configuration.

        Returns:
            [`Kosmos2_5Config`]: An instance of a configuration object
        """

        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )
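The three classes above follow the usual transformers composite-config pattern: the top-level config holds serialized dicts of the two sub-configs. A minimal sketch of building the composite config, assuming this file is importable as `configuration_kosmos2_5`:

```python
from configuration_kosmos2_5 import (
    Kosmos2_5Config,
    Kosmos2_5TextConfig,
    Kosmos2_5VisionConfig,
)

# Sub-configs with the defaults from this file (microsoft/kosmos-2.5 style).
text_config = Kosmos2_5TextConfig(embed_dim=1536, layers=24, attention_heads=16)
vision_config = Kosmos2_5VisionConfig(hidden_size=1536, num_hidden_layers=18)

# from_text_vision_configs serializes the sub-configs to dicts and rebuilds them in __init__.
config = Kosmos2_5Config.from_text_vision_configs(text_config, vision_config, latent_query_num=2048)

assert config.text_config.embed_dim == 1536
assert config.vision_config.num_hidden_layers == 18
assert config.model_type == "kosmos-2.5"
```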
image_processing_kosmos2_5.py
ADDED
@@ -0,0 +1,341 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Kosmos2_5."""

import math
from typing import Dict, Optional, Tuple, Union

import numpy as np

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
    convert_to_rgb,
    normalize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    get_image_size,
    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from transformers.utils import TensorType, is_torch_available, logging
from transformers.utils.import_utils import requires_backends


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"


# adapted from: https://discuss.pytorch.org/t/tf-image-extract-patches-in-pytorch/171409/2
def torch_extract_patches(image_tensor, patch_height, patch_width):
    """
    Utility function to extract patches from a given image tensor. Returns a tensor of shape
    (1, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`).

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    """
    requires_backends(torch_extract_patches, ["torch"])

    image_tensor = image_tensor.unsqueeze(0)
    patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
    patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
    patches = patches.permute(0, 4, 2, 3, 1).reshape(
        image_tensor.size(2) // patch_height,
        image_tensor.size(3) // patch_width,
        image_tensor.size(1) * patch_height * patch_width,
    )
    return patches.unsqueeze(0)


class Kosmos2_5ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Kosmos2_5 image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to the Kosmos2_5 paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to the Kosmos2_5 paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 4096):
            The maximum number of patches to extract from the image as per the
            [Kosmos2_5 paper](https://arxiv.org/pdf/2309.11419).
    """

    model_input_names = ["flattened_patches"]

    def __init__(
        self,
        do_convert_rgb: bool = True,
        do_normalize: bool = True,
        patch_size: Optional[Dict[str, int]] = None,
        max_patches: int = 4096,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
        self.do_normalize = do_normalize
        self.do_convert_rgb = do_convert_rgb
        self.max_patches = max_patches

    def extract_flattened_patches(
        self,
        image: np.ndarray,
        max_patches: int,
        patch_size: dict,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> Tuple[np.ndarray, int, int, int, int]:
        """
        Extract flattened patches from an image.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.

        Returns:
            result (`np.ndarray`):
                A sequence of `max_patches` flattened patches, plus the resized width and height and the number of
                patch rows and columns.
        """
        requires_backends(self.extract_flattened_patches, "torch")

        # convert to torch
        image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
        image = torch.from_numpy(image)

        patch_height, patch_width = patch_size["height"], patch_size["width"]
        image_height, image_width = get_image_size(image, ChannelDimension.FIRST)

        # maximize scale s.t. rows * columns <= max_patches
        scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
        num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
        num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
        resized_height = max(num_feasible_rows * patch_height, 1)
        resized_width = max(num_feasible_cols * patch_width, 1)

        image = torch.nn.functional.interpolate(
            image.unsqueeze(0),
            size=(resized_height, resized_width),
            mode="bilinear",
            align_corners=False,
            antialias=True,
        ).squeeze(0)

        # [1, rows, columns, patch_height * patch_width * image_channels]
        patches = torch_extract_patches(image, patch_height, patch_width)

        patches_shape = patches.shape
        rows = patches_shape[1]
        columns = patches_shape[2]
        depth = patches_shape[3]

        # [rows * columns, patch_height * patch_width * image_channels]
        patches = patches.reshape([rows * columns, depth])

        # [rows * columns, 1]
        row_ids = torch.arange(rows).reshape([rows, 1]).repeat(1, columns).reshape([rows * columns, 1])
        col_ids = torch.arange(columns).reshape([1, columns]).repeat(rows, 1).reshape([rows * columns, 1])

        # Offset by 1 so the ids do not contain zeros, which represent padding.
        row_ids += 1
        col_ids += 1

        # Prepare additional patch features.
        # [rows * columns, 1]
        row_ids = row_ids.to(torch.float32)
        col_ids = col_ids.to(torch.float32)

        # [rows * columns, 2 + patch_height * patch_width * image_channels]
        result = torch.cat([row_ids, col_ids, patches], -1)

        # [max_patches, 2 + patch_height * patch_width * image_channels]
        result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()

        result = to_numpy_array(result)

        return result, resized_width, resized_height, rows, columns

    def normalize(
        self,
        image: np.ndarray,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        if image.dtype == np.uint8:
            image = image.astype(np.float32)

        # take mean across the whole `image`
        mean = np.mean(image)
        std = np.std(image)
        adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))

        return normalize(
            image,
            mean=mean,
            std=adjusted_stddev,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def preprocess(
        self,
        images: ImageInput,
        do_convert_rgb: bool = None,
        do_normalize: Optional[bool] = None,
        max_patches: Optional[int] = None,
        patch_size: Optional[Dict[str, int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
        image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
        images are standardized following the tensorflow implementation of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).


        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        patch_size = patch_size if patch_size is not None else self.patch_size
        max_patches = max_patches if max_patches is not None else self.max_patches

        if kwargs.get("data_format", None) is not None:
            raise ValueError("data_format is not an accepted input as the outputs are flattened patches.")

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        # PIL RGBA images are converted to RGB
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        if do_normalize:
            images = [self.normalize(image=image, input_data_format=input_data_format) for image in images]

        # convert to torch tensor and permute
        images = [
            self.extract_flattened_patches(
                image=image,
                max_patches=max_patches,
                patch_size=patch_size,
                input_data_format=input_data_format,
            )
            for image in images
        ]

        # unpack the (patches, width, height, rows, cols) tuples
        width = [image[1] for image in images]
        height = [image[2] for image in images]
        rows = [image[3] for image in images]
        cols = [image[4] for image in images]
        images = [image[0] for image in images]

        # create attention mask in numpy
        attention_masks = [(image.sum(axis=-1) != 0).astype(np.float32) for image in images]

        encoded_outputs = BatchFeature(
            data={
                "flattened_patches": images,
                "attention_mask": attention_masks,
                "width": width,
                "height": height,
                "rows": rows,
                "cols": cols,
            },
            tensor_type=return_tensors,
        )

        return encoded_outputs
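To see the resize-then-patch logic end to end, a small usage sketch, assuming this file is importable as `image_processing_kosmos2_5` and torch is installed; the input image is random data:

```python
import numpy as np
from image_processing_kosmos2_5 import Kosmos2_5ImageProcessor

processor = Kosmos2_5ImageProcessor()  # 16x16 patches, max_patches=4096

# A dummy 1024x768 RGB image, channels-last uint8.
image = np.random.randint(0, 256, (1024, 768, 3), dtype=np.uint8)
outputs = processor.preprocess(image, return_tensors="pt")

# Each row is [row_id, col_id, 16*16*3 pixel values], zero-padded up to 4096 rows:
# this image is resized to 73x55 patches (4015 <= 4096), the remaining rows are padding.
print(outputs["flattened_patches"].shape)  # torch.Size([1, 4096, 770])
print(outputs["attention_mask"].shape)     # torch.Size([1, 4096])
```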
modeling_kosmos2_5.py
ADDED
The diff for this file is too large to render.
See raw diff
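Although the modeling diff is not rendered here, repo-local modeling files like these are typically loaded through `trust_remote_code`. A hedged sketch; the `auto_map` entries and the repo id placeholder are assumptions about how this repo is wired, not something shown in this commit:

```python
# The repo's config.json would need an auto_map pointing at the files in this commit, e.g.:
# "auto_map": {
#     "AutoConfig": "configuration_kosmos2_5.Kosmos2_5Config",
#     "AutoModel": "modeling_kosmos2_5.Kosmos2_5Model"
# }
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("<this-repo-id>", trust_remote_code=True)
model = AutoModel.from_pretrained("<this-repo-id>", trust_remote_code=True)
```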
processing_kosmos2_5.py
ADDED
@@ -0,0 +1,145 @@
# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Kosmos2_5.
"""

from typing import List, Optional, Union

from transformers.image_processing_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
from transformers.utils import TensorType, is_torch_available


if is_torch_available():
    import torch


class Kosmos2_5Processor(ProcessorMixin):
    r"""
    Constructs a Kosmos2_5 processor which wraps a Kosmos2_5 image processor and a fast tokenizer into a single
    processor.

    [`Kosmos2_5Processor`] offers all the functionalities of [`Kosmos2_5ImageProcessor`] and
    [`PreTrainedTokenizerFast`]. See the docstring of [`~Kosmos2_5Processor.__call__`] and
    [`~Kosmos2_5Processor.decode`] for more information.

    Args:
        image_processor (`Kosmos2_5ImageProcessor`):
            An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerFast`):
            An instance of [`PreTrainedTokenizerFast`]. The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Kosmos2_5ImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer):
        tokenizer.return_token_type_ids = False
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images=None,
        text: Union[TextInput, List[TextInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = True,
        truncation: Union[bool, str, TruncationStrategy] = True,
        max_length: Optional[int] = None,
        max_patches: Optional[int] = 4096,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """
        This method uses [`Kosmos2_5ImageProcessor.preprocess`] to prepare image(s) for the model, and
        [`PreTrainedTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        The rest of this documentation shows the arguments specific to `Kosmos2_5Processor`.
        """
        if images is None and text is None:
            raise ValueError("You have to specify either images or text.")

        encoding = BatchFeature()

        if images is not None:
            image_encoding = self.image_processor(
                images, return_tensors=return_tensors, max_patches=max_patches, **kwargs
            )
            # `rows` and `cols` are not model inputs, so they are dropped here.
            image_encoding.pop("rows")
            image_encoding.pop("cols")
            encoding.update(image_encoding)

        if text is not None:
            text_encoding = self.tokenizer(
                text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
                return_tensors="pt",
            )

            batch_size, seq_len = text_encoding.input_ids.shape
            # Prepend a 2051-token image span to every sequence: id 0, a boundary token
            # (id 100283), 2048 placeholder slots matching the default `latent_query_num`,
            # and a closing boundary token (id 100284).
            additional_tokens = [0, 100283] + [0] * 2048 + [100284]
            additional_tokens_tensor = torch.tensor(additional_tokens).unsqueeze(0).repeat(batch_size, 1)
            input_ids = torch.cat([additional_tokens_tensor, text_encoding.input_ids], dim=1)

            # 1 marks positions to be filled with image embeddings, 0 ordinary text positions,
            # -1 the boundary tokens around the image span.
            image_embeds_position_mask = [0, -1] + [1] * 2048 + [-1] + [0] * seq_len
            image_embeds_position_mask = (
                torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
            )

            # The prepended image span is always fully attended to.
            added_attention_mask = [1, 1] + [1] * 2048 + [1]
            added_attention_mask_tensor = torch.tensor(added_attention_mask).unsqueeze(0).repeat(batch_size, 1)
            attention_mask = torch.cat([added_attention_mask_tensor, text_encoding.attention_mask], dim=1)
            encoding.update(
                {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "image_embeds_position_mask": image_embeds_position_mask,
                }
            )

        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
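To make the prefix arithmetic in `__call__` concrete, a standalone sketch that reproduces it outside the processor class. The tokenizer checkpoint is a stand-in with a pad token, not the model's real vocabulary, and `<ocr>` is the task prompt from the KOSMOS-2.5 paper:

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # stand-in fast tokenizer
text_encoding = tokenizer(["<ocr>"], padding=True, truncation=True, return_tensors="pt")

batch_size, seq_len = text_encoding.input_ids.shape
# The same 2051-token image prefix that __call__ builds above.
additional_tokens = torch.tensor([[0, 100283] + [0] * 2048 + [100284]]).repeat(batch_size, 1)
input_ids = torch.cat([additional_tokens, text_encoding.input_ids], dim=1)
image_embeds_position_mask = (
    torch.LongTensor([[0, -1] + [1] * 2048 + [-1] + [0] * seq_len]).repeat(batch_size, 1)
)

print(input_ids.shape)                   # (1, 2051 + seq_len)
print(image_embeds_position_mask.shape)  # same shape as input_ids
```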