jadechoghari committed
Commit a265f86
1 Parent(s): e6f5ed4

add initial files

cambrian_arch.py ADDED
@@ -0,0 +1,1712 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
import random
from abc import ABC, abstractmethod

import torch
import torch.nn as nn
import torch.nn.functional as F

# define the constants
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

from .multimodal_encoder_builder import build_vision_tower_aux_list
from .multimodal_projector_builder import build_vision_projector
from .vision_sampler import VisionTokenSampler

IS_XLA_AVAILABLE = False


class CambrianMetaModel:

    def __init__(self, config):
        super(CambrianMetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower_aux_list"):

            projector_type = getattr(config, "mm_projector_type", "linear")
            if projector_type == "sva":

                vision_hidden_size = config.vision_hidden_size
                num_query_group = config.num_query_group
                query_num_list = config.query_num_list
                connector_only = config.connector_only
                connector_depth = config.connector_depth
                self.vision_tower_aux_list = build_vision_tower_aux_list(
                    config, delay_load=True
                )
                self.mm_projector = nn.Sequential(
                    nn.Linear(vision_hidden_size * num_query_group, config.hidden_size),
                    nn.GELU(),
                    nn.Linear(config.hidden_size, config.hidden_size),
                )

                image_token_len = config.image_token_len
                vision_tower_aux_token_len_list = (
                    self.config.mm_vision_tower_aux_token_len_list
                )
                cross_att_token_len_list = [
                    int(vision_tower_aux_token_len**0.5) // int(image_token_len**0.5)
                    for vision_tower_aux_token_len in vision_tower_aux_token_len_list
                ]

                for aux_i, vision_tower_aux in enumerate(self.vision_tower_aux_list):
                    setattr(
                        self,
                        "mm_projector_aux_{}".format(aux_i),
                        nn.Sequential(
                            nn.Linear(vision_tower_aux.hidden_size, vision_hidden_size),
                            nn.GELU(),
                            nn.Linear(vision_hidden_size, vision_hidden_size),
                            nn.LayerNorm(vision_hidden_size),
                        ),
                    )

                for query_group_i in range(num_query_group):
                    cross_att_token_len_list = [
                        int(vision_tower_aux_token_len**0.5)
                        // int(query_num_list[query_group_i] ** 0.5)
                        for vision_tower_aux_token_len in vision_tower_aux_token_len_list
                    ]
                    setattr(
                        self,
                        "vision_sampler_{}".format(query_group_i),
                        VisionTokenSampler(
                            vision_hidden_size,
                            vision_hidden_size,
                            [vision_hidden_size] * len(self.vision_tower_aux_list),
                            cross_att_token_len_list,
                            vision_hidden_size,
                            connector_depth,
                        ),
                    )

                if not connector_only:
                    num_of_vision_sampler_layers = (
                        config.num_of_vision_sampler_layers
                    ) = config.num_of_vision_sampler_layers
                    config.start_of_vision_sampler_layers = (
                        config.start_of_vision_sampler_layers
                    )
                    config.stride_of_vision_sampler_layers = (
                        config.stride_of_vision_sampler_layers
                    )
                    cross_att_token_len_list = [
                        int(vision_tower_aux_token_len**0.5)
                        // int(image_token_len**0.5)
                        for vision_tower_aux_token_len in vision_tower_aux_token_len_list
                    ]
                    self.vision_sampler_layers = nn.ModuleList(
                        [
                            VisionTokenSampler(
                                config.hidden_size,
                                vision_hidden_size,
                                [vision_hidden_size] * len(self.vision_tower_aux_list),
                                cross_att_token_len_list,
                                vision_hidden_size,
                                1,
                            )
                            for layer_idx in range(0, num_of_vision_sampler_layers)
                        ]
                    )

                self.vision_query = nn.Parameter(
                    torch.randn((num_query_group, vision_hidden_size), dtype=self.dtype)
                )

                self.image_newline = nn.Parameter(
                    torch.empty(config.hidden_size, dtype=self.dtype)
                )

                self.frame_pos = torch.stack(
                    [
                        1
                        / torch.pow(
                            torch.tensor(10000),
                            torch.tensor(2 * (hid_j // 2) / config.hidden_size),
                        )
                        for hid_j in range(config.hidden_size)
                    ]
                )

            else:
                self.vision_tower_aux_list = build_vision_tower_aux_list(
                    config, delay_load=True
                )
                config.mm_hidden_size = sum(
                    [
                        vision_tower_aux.hidden_size
                        for vision_tower_aux in self.vision_tower_aux_list
                    ]
                )
                self.mm_projector = build_vision_projector(config)
                self.image_newline = nn.Parameter(
                    torch.empty(config.hidden_size, dtype=self.dtype)
                )

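    # get_frame_pos (below) builds sinusoidal positional encodings over frame
    # indices: self.frame_pos holds the per-dimension inverse frequencies
    # 1 / 10000**(2 * (j // 2) / hidden_size), and multiplying by the frame
    # index gives the phase. Even dimensions are overwritten with sin; note
    # that the cos for odd dimensions is then taken of the already-sined even
    # slots, which is how the code is written here.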
    def get_frame_pos(self, time_range):
        frame_pos = self.frame_pos.reshape(1, -1) * time_range.reshape(-1, 1).to(
            self.frame_pos.device
        )
        frame_pos[:, 0::2] = torch.sin(frame_pos[:, 0::2])
        frame_pos[:, 1::2] = torch.cos(frame_pos[:, 0::2])
        frame_pos = frame_pos.unsqueeze(1)
        return frame_pos

    # def get_vision_tower(self):
    #     vision_tower = getattr(self, 'vision_tower', None)
    #     if type(vision_tower) is list:
    #         vision_tower = vision_tower[0]
    #     return vision_tower

    def get_vision_tower_aux_list(self):
        vision_tower_aux_list = getattr(self, "vision_tower_aux_list", None)
        return vision_tower_aux_list

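    # initialize_vision_modules mirrors __init__ for the training path: it
    # copies the multimodal options from model_args onto self.config, builds
    # (or reloads) the auxiliary vision towers, the per-tower projectors, the
    # per-query-group VisionTokenSampler connectors and, when connector_only
    # is False, the sampler layers interleaved inside the LLM, then optionally
    # restores all of them from a pretrained mm_mlp_adapter checkpoint.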
    def initialize_vision_modules(self, model_args, fsdp=None):
        # vision_tower = model_args.vision_tower
        num_query_group = model_args.num_query_group
        query_num_list = model_args.query_num_list
        vision_hidden_size = model_args.vision_hidden_size
        vision_tower_aux_list = model_args.vision_tower_aux_list
        vision_tower_aux_token_len_list = model_args.vision_tower_aux_token_len_list
        image_token_len = model_args.image_token_len
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
        connector_only = model_args.connector_only
        connector_depth = model_args.connector_depth

        # self.config.mm_vision_tower = vision_tower
        self.config.image_token_len = image_token_len
        self.config.num_query_group = num_query_group
        self.config.query_num_list = query_num_list
        assert num_query_group == len(query_num_list)
        self.config.connector_depth = connector_depth
        self.config.mm_vision_tower_aux_list = vision_tower_aux_list
        self.config.mm_vision_tower_aux_token_len_list = vision_tower_aux_token_len_list
        self.config.connector_only = connector_only
        self.config.highres_connect = model_args.highres_connect
        self.config.highres = model_args.highres
        self.config.frame_pos = model_args.frame_pos
        self.config.lowres_token = model_args.lowres_token
        self.config.connect_layer = model_args.connect_layer
        self.config.dino_threshold = getattr(model_args, "dino_threshold", 0.83)
        self.config.drop_threshold = getattr(model_args, "drop_threshold", 0.6)
        self.config.is_image_newline = getattr(model_args, "is_image_newline", True)

        if self.get_vision_tower_aux_list() is None:
            vision_tower_aux_list = build_vision_tower_aux_list(model_args)
            if model_args.unfreeze_mm_vision_tower:
                self.vision_tower_aux_list = nn.ModuleList(vision_tower_aux_list)
            else:
                self.vision_tower_aux_list = vision_tower_aux_list
        else:
            vision_tower_aux_list = self.vision_tower_aux_list
            for vision_tower_aux in vision_tower_aux_list:
                vision_tower_aux.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(
            model_args, "mm_projector_type", "linear"
        )
        self.config.vision_hidden_size = vision_hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, "mm_projector", None) is None:

            if self.config.mm_projector_type == "sva":
                self.mm_projector = nn.Sequential(
                    nn.Linear(
                        vision_hidden_size * num_query_group, self.config.hidden_size
                    ),
                    nn.GELU(),
                    nn.Linear(self.config.hidden_size, self.config.hidden_size),
                )
                for aux_i, vision_tower_aux in enumerate(vision_tower_aux_list):
                    setattr(
                        self,
                        "mm_projector_aux_{}".format(aux_i),
                        nn.Sequential(
                            nn.Linear(vision_tower_aux.hidden_size, vision_hidden_size),
                            nn.GELU(),
                            nn.Linear(vision_hidden_size, vision_hidden_size),
                            nn.LayerNorm(vision_hidden_size),
                        ),
                    )

                # vision sampler for each group of query as the connector before the LLM
                for query_group_i in range(num_query_group):
                    cross_att_token_len_list = [
                        int(vision_tower_aux_token_len**0.5)
                        // int(query_num_list[query_group_i] ** 0.5)
                        for vision_tower_aux_token_len in vision_tower_aux_token_len_list
                    ]
                    setattr(
                        self,
                        "vision_sampler_{}".format(query_group_i),
                        VisionTokenSampler(
                            vision_hidden_size,
                            vision_hidden_size,
                            [vision_hidden_size] * len(vision_tower_aux_list),
                            cross_att_token_len_list,
                            vision_hidden_size,
                            connector_depth,
                        ),
                    )

                # sampler layers within LLM
                if not connector_only:
                    num_of_vision_sampler_layers = (
                        self.config.num_of_vision_sampler_layers
                    ) = model_args.num_of_vision_sampler_layers
                    self.config.start_of_vision_sampler_layers = (
                        model_args.start_of_vision_sampler_layers
                    )
                    self.config.stride_of_vision_sampler_layers = (
                        model_args.stride_of_vision_sampler_layers
                    )
                    cross_att_token_len_list = [
                        int(vision_tower_aux_token_len**0.5)
                        // int(image_token_len**0.5)
                        for vision_tower_aux_token_len in vision_tower_aux_token_len_list
                    ]
                    self.vision_sampler_layers = nn.ModuleList(
                        [
                            VisionTokenSampler(
                                self.config.hidden_size,
                                vision_hidden_size,
                                [vision_hidden_size] * len(vision_tower_aux_list),
                                cross_att_token_len_list,
                                vision_hidden_size,
                                1,
                            )
                            for layer_idx in range(0, num_of_vision_sampler_layers)
                        ]
                    )
                vision_embed_std = 1 / torch.sqrt(
                    torch.tensor(vision_hidden_size, dtype=self.dtype)
                )
                self.vision_query = nn.Parameter(
                    torch.randn((num_query_group, vision_hidden_size), dtype=self.dtype)
                    * vision_embed_std
                )

                embed_std = 1 / torch.sqrt(
                    torch.tensor(self.config.hidden_size, dtype=self.dtype)
                )
                self.image_newline = nn.Parameter(
                    torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
                )

            else:
                self.config.mm_hidden_size = sum(
                    [
                        vision_tower_aux.hidden_size
                        for vision_tower_aux in vision_tower_aux_list
                    ]
                )
                self.mm_projector = build_vision_projector(self.config)
                embed_std = 1 / torch.sqrt(
                    torch.tensor(self.config.hidden_size, dtype=self.dtype)
                )
                self.image_newline = nn.Parameter(
                    torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
                )
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(
                pretrain_mm_mlp_adapter, map_location="cpu"
            )

            def get_w(weights, keyword):
                return {
                    k.split(keyword + ".")[1]: v
                    for k, v in weights.items()
                    if keyword + "." in k
                }

            self.mm_projector.load_state_dict(
                get_w(mm_projector_weights, "mm_projector"), strict=True
            )

            if self.config.mm_projector_type == "sva":
                for aux_i in range(len(vision_tower_aux_list)):
                    getattr(self, "mm_projector_aux_{}".format(aux_i)).load_state_dict(
                        get_w(
                            mm_projector_weights, "mm_projector_aux_{}".format(aux_i)
                        ),
                        strict=True,
                    )

                for query_group_i in range(num_query_group):
                    getattr(
                        self, "vision_sampler_{}".format(query_group_i)
                    ).load_state_dict(
                        get_w(
                            mm_projector_weights,
                            "vision_sampler_{}".format(query_group_i),
                        ),
                        strict=True,
                    )

                if not connector_only:
                    self.vision_sampler_layers.load_state_dict(
                        get_w(mm_projector_weights, "vision_sampler_layers"),
                        strict=True,
                    )
                self.vision_query.data = mm_projector_weights["model.vision_query"]
            self.image_newline.data = mm_projector_weights["model.image_newline"]


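# unmask_attention_mask zeroes out the rows or columns of a (N, H, W) boolean
# mask that correspond to the letterbox padding added when an image with a
# different aspect ratio was resized onto the square feature grid.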
def unmask_attention_mask(mask, original_size):
    original_w, original_h = original_size
    cur_h, cur_w = mask.shape[1:3]

    original_aspect_ratio = original_w / original_h
    current_aspect_ratio = cur_w / cur_h

    if original_aspect_ratio > current_aspect_ratio:
        scale_factor = cur_w / original_w
        new_height = int(original_h * scale_factor)
        padding = (cur_h - new_height) // 2
        if padding > 0:
            mask[:, :padding, :] = 0
            mask[:, -padding:, :] = 0
        return mask
    else:
        scale_factor = cur_h / original_h
        new_width = int(original_w * scale_factor)
        padding = (cur_w - new_width) // 2
        if padding > 0:
            mask[:, :, :padding] = 0
            mask[:, :, -padding:] = 0
        return mask


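# unpad_image is the feature-space counterpart: it crops the padded rows or
# columns back out of a feature map. As an illustrative example, a 1000x500
# image embedded on a 24x24 grid keeps all 24 columns but only the central
# int(500 * 24 / 1000) = 12 rows; the 6-row bands above and below are cut.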
def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
        original_size (tuple): The original size of the image (width, height).

    Returns:
        torch.Tensor: The unpadded image tensor.
    """
    original_width, original_height = original_size
    current_height, current_width = tensor.shape[1:3]

    original_aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if original_aspect_ratio > current_aspect_ratio:
        scale_factor = current_width / original_width
        new_height = int(original_height * scale_factor)
        padding = (current_height - new_height) // 2
        unpadded_tensor = tensor[:, padding : current_height - padding, :]
        # if 0 in unpadded_tensor.shape:
        #     print(f"scale_factor: {scale_factor}, new_height: {new_height}, padding: {padding}, original_width: {original_width}, original_height: {original_height}")
    else:
        scale_factor = current_height / original_height
        new_width = int(original_width * scale_factor)
        padding = (current_width - new_width) // 2
        unpadded_tensor = tensor[:, :, padding : current_width - padding]
        # if 0 in unpadded_tensor.shape:
        #     print(f"scale_factor: {scale_factor}, new_width: {new_width}, padding: {padding}, original_width: {original_width}, original_height: {original_height}")

    return unpadded_tensor


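# CambrianMetaForCausalLM is used as a mixin: a concrete causal LM subclass is
# expected to provide get_model(), and this class contributes the vision-side
# plumbing that turns images or video frames into the token embeddings that
# get spliced into the prompt.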
class CambrianMetaForCausalLM(ABC):

    @abstractmethod
    def get_model(self):
        pass

    # def get_vision_tower(self):
    #     return self.get_model().get_vision_tower()

    def get_vision_tower_aux_list(self):
        return self.get_model().get_vision_tower_aux_list()

    def rearrange_vision_tower_features_train(
        self,
        vision_tower_aux_feature_list,
        vision_tower_aux_attention_masks_list,
        query_side_len,
    ):
        vision_tower_aux_feature_rearranged_list = []
        vision_tower_aux_attention_masks_rearranged_list = []
        bs = vision_tower_aux_feature_list[0].shape[0]
        for vision_tower_aux_feature, vision_tower_aux_attention_masks in zip(
            vision_tower_aux_feature_list, vision_tower_aux_attention_masks_list
        ):
            aux_height = aux_width = int(vision_tower_aux_feature.shape[1] ** 0.5)
            assert (aux_height // query_side_len) * query_side_len == aux_height

            reduce_factor = aux_height // query_side_len
            vision_tower_aux_feature_rearranged = vision_tower_aux_feature.view(
                bs, query_side_len, reduce_factor, query_side_len, reduce_factor, -1
            )
            vision_tower_aux_feature_rearranged = (
                vision_tower_aux_feature_rearranged.permute(0, 1, 3, 2, 4, 5)
                .contiguous()
                .flatten(0, 2)
                .flatten(1, 2)
            )

            vision_tower_aux_attention_masks_rearranged = (
                vision_tower_aux_attention_masks.view(
                    bs * query_side_len * query_side_len, reduce_factor * reduce_factor
                )
            )

            vision_tower_aux_feature_rearranged_list.append(
                vision_tower_aux_feature_rearranged
            )
            vision_tower_aux_attention_masks_rearranged_list.append(
                vision_tower_aux_attention_masks_rearranged
            )
        return (
            vision_tower_aux_feature_rearranged_list,
            vision_tower_aux_attention_masks_rearranged_list,
        )

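    # The inference variant below performs the same block-wise regrouping of
    # each tower's token grid into query_side_len**2 windows of
    # reduce_factor**2 tokens, but per sample: it also builds a boolean
    # attention mask per image and, with unpad=True, drops the blocks that
    # fall inside the letterbox padding via unpad_image / unmask_attention_mask.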
    def rearrange_vision_tower_features_inference(
        self, vision_tower_aux_feature_list, query_side_len, image_sizes, unpad=False
    ):
        vision_tower_aux_feature_rearranged_list = []
        vision_tower_aux_attention_masks_rearranged_list = []
        bs = vision_tower_aux_feature_list[0].shape[0]
        for vision_tower_aux_feature in vision_tower_aux_feature_list:
            aux_height = aux_width = int(vision_tower_aux_feature.shape[1] ** 0.5)
            assert (aux_height // query_side_len) * query_side_len == aux_height

            reduce_factor = aux_height // query_side_len

            vision_tower_aux_feature_rearranged = []
            vision_tower_aux_attention_masks_rearranged = []
            for batch_i in range(bs):
                image_size = image_sizes[batch_i]
                cur_vision_tower_aux_feature = vision_tower_aux_feature[batch_i]

                cur_vision_tower_aux_attention_masks_rearranged = torch.ones(
                    (1, aux_height, aux_width),
                    dtype=torch.bool,
                    device=cur_vision_tower_aux_feature.device,
                )
                cur_vision_tower_aux_feature_rearranged = (
                    cur_vision_tower_aux_feature.view(
                        1,
                        query_side_len,
                        reduce_factor,
                        query_side_len,
                        reduce_factor,
                        -1,
                    )
                )
                cur_vision_tower_aux_feature_rearranged = (
                    cur_vision_tower_aux_feature_rearranged.permute(
                        0, 1, 3, 2, 4, 5
                    ).contiguous()
                )
                if unpad:
                    cur_vision_tower_aux_feature_rearranged = unpad_image(
                        cur_vision_tower_aux_feature_rearranged, image_size
                    )
                cur_vision_tower_aux_feature_rearranged = (
                    cur_vision_tower_aux_feature_rearranged.flatten(0, 2).flatten(1, 2)
                )  # query_side_len*query_side_len X reduce_factor*reduce_factor X C

                cur_vision_tower_aux_attention_masks_rearranged = unmask_attention_mask(
                    cur_vision_tower_aux_attention_masks_rearranged, image_size
                )
                cur_vision_tower_aux_attention_masks_rearranged = (
                    cur_vision_tower_aux_attention_masks_rearranged.view(
                        1, query_side_len, reduce_factor, query_side_len, reduce_factor
                    )
                    .permute(0, 1, 3, 2, 4)
                    .contiguous()
                )
                if unpad:
                    cur_vision_tower_aux_attention_masks_rearranged = unpad_image(
                        cur_vision_tower_aux_attention_masks_rearranged, image_size
                    )
                cur_vision_tower_aux_attention_masks_rearranged = (
                    cur_vision_tower_aux_attention_masks_rearranged.flatten(
                        0, 2
                    ).flatten(1, 2)
                )

                cur_vision_tower_aux_attention_masks_rearranged[
                    cur_vision_tower_aux_attention_masks_rearranged.sum(-1) == 0
                ] = True

                vision_tower_aux_feature_rearranged.append(
                    cur_vision_tower_aux_feature_rearranged
                )
                vision_tower_aux_attention_masks_rearranged.append(
                    cur_vision_tower_aux_attention_masks_rearranged
                )

            vision_tower_aux_feature_rearranged = torch.cat(
                vision_tower_aux_feature_rearranged, 0
            )
            vision_tower_aux_attention_masks_rearranged = torch.cat(
                vision_tower_aux_attention_masks_rearranged, 0
            )

            vision_tower_aux_feature_rearranged_list.append(
                vision_tower_aux_feature_rearranged
            )
            vision_tower_aux_attention_masks_rearranged_list.append(
                vision_tower_aux_attention_masks_rearranged
            )

        return (
            vision_tower_aux_feature_rearranged_list,
            vision_tower_aux_attention_masks_rearranged_list,
        )

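    # encode_images runs the auxiliary vision towers. Batches larger than
    # chunk_size (64) frames are encoded in chunks and concatenated, purely to
    # bound peak activation memory; encode_type="dino" / "siglip" select the
    # last / first tower so that video frame selection can use DINO features
    # on their own.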
    def encode_images(self, image_aux_list, encode_type=None):
        vision_tower_aux_list = self.get_model().get_vision_tower_aux_list()
        image_aux_features_list = []
        chunk_size = 64
        if encode_type == "dino":
            image_aux = image_aux_list[-1]
            vision_tower_aux = vision_tower_aux_list[-1]
            if image_aux.shape[0] > chunk_size:
                image_aux_features_chunks = []
                for start_idx in range(0, image_aux.shape[0], chunk_size):
                    end_idx = min(start_idx + chunk_size, image_aux.shape[0])
                    chunk = image_aux[start_idx:end_idx]
                    image_aux_features_chunk = vision_tower_aux(chunk)
                    image_aux_features_chunks.append(image_aux_features_chunk)
                image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
            else:
                image_aux_features = vision_tower_aux(image_aux)
            return image_aux_features
        elif encode_type == "siglip":
            image_aux = image_aux_list[0]
            vision_tower_aux = vision_tower_aux_list[0]
            if image_aux.shape[0] > chunk_size:
                image_aux_features_chunks = []
                for start_idx in range(0, image_aux.shape[0], chunk_size):
                    end_idx = min(start_idx + chunk_size, image_aux.shape[0])
                    chunk = image_aux[start_idx:end_idx]
                    image_aux_features_chunk = vision_tower_aux(chunk)
                    image_aux_features_chunks.append(image_aux_features_chunk)
                image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
            else:
                image_aux_features = vision_tower_aux(image_aux)
            return image_aux_features
        else:
            for image_aux, vision_tower_aux in zip(
                image_aux_list, vision_tower_aux_list
            ):
                if image_aux.shape[0] > chunk_size:
                    image_aux_features_chunks = []
                    for start_idx in range(0, image_aux.shape[0], chunk_size):
                        end_idx = min(start_idx + chunk_size, image_aux.shape[0])
                        chunk = image_aux[start_idx:end_idx]
                        image_aux_features_chunk = vision_tower_aux(chunk)
                        image_aux_features_chunks.append(image_aux_features_chunk)
                    image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
                else:
                    image_aux_features = vision_tower_aux(image_aux)
                image_aux_features_list.append(image_aux_features)
            return image_aux_features_list

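    # select_frame thins a long video so its tokens fit in the context window.
    # Frames are scored inside windows of window_size by their mean cosine
    # similarity to the other frames in the window (features are L2-normalised
    # first); frames whose similarity stays below `threshold` are kept as the
    # distinctive ones (the middle frame of each window is always kept), and
    # the result is capped at 400 frames to avoid running out of memory.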
    def select_frame(
        self,
        feature_list,
        split_sizes,
        input_ids,
        new_image_aux_list,
        image_sizes,
        window_size=16,
        threshold=0.83,
    ):
        dino_features_batch = torch.split(feature_list, split_sizes, dim=0)
        new_image_aux_batch_0 = torch.split(new_image_aux_list[0], split_sizes, dim=0)
        new_image_aux_batch_1 = torch.split(new_image_aux_list[1], split_sizes, dim=0)
        new_split_sizes = []
        selected_frames_all_0 = []
        selected_frames_all_1 = []
        selected_frames_feature_all = []
        selected_frame_indices_all = []
        for i_batch, frame_features in enumerate(dino_features_batch):
            try:
                if "llama" in self.get_model().config.model_type:
                    text_len = torch.where(input_ids[i_batch] == 128002)[-1][0]
                else:
                    text_len = torch.where(input_ids[i_batch] == 151643)[-1][0]
            except:
                text_len = len(input_ids[i_batch])
            original_width, original_height = image_sizes[i_batch]
            if getattr(self.get_model().config, "highres", False):
                token_per_frame = self.get_model().config.lowres_token ** 2
            else:
                token_per_frame = self.get_model().config.image_token_len
            # current_height, current_width = token_per_side, token_per_side
            # original_aspect_ratio = original_width / original_height
            # current_aspect_ratio = current_width / current_height
            # if original_aspect_ratio > current_aspect_ratio:
            #     scale_factor = current_width / original_width
            #     new_height = int(original_height * scale_factor)
            #     padding = math.ceil((current_height - new_height) / 2.0)
            #     token_per_frame = (
            #         current_height - padding * 2
            #     ) * token_per_side + token_per_side
            # else:
            #     scale_factor = current_height / original_height
            #     new_width = int(original_width * scale_factor)
            #     padding = math.ceil((current_width - new_width) / 2.0)
            #     token_per_frame = (current_width - padding * 2) * token_per_side + (
            #         current_width - padding * 2
            #     )
            # token_per_frame = (
            #     token_per_side**2 if token_per_frame < 1 else token_per_frame
            # )
            max_num_frames = max(
                1,
                (
                    self.get_model().config.tokenizer_model_max_length
                    - text_len
                    - getattr(self.get_model().config, "inference_max_length", 16)
                )
                // token_per_frame,
            )
            if len(frame_features) < max_num_frames:
                selected_frames_all_0.append(new_image_aux_batch_0[i_batch])
                selected_frames_all_1.append(new_image_aux_batch_1[i_batch])
                selected_frames_feature_all.append(frame_features)
                new_split_sizes.append(len(frame_features))
                selected_frame_indices_all.append(torch.arange(len(frame_features)))
                continue

            num_segments = len(frame_features) // window_size
            if num_segments == 0:
                query_feature = frame_features.flatten(1, 2)
                query_feature = query_feature / torch.norm(
                    (query_feature), dim=1, keepdim=True
                )
                similarities = torch.mean(query_feature @ query_feature.T, dim=1)
                similarities[len(frame_features) // 2] = 0
                indices = torch.where(similarities < threshold)[0]
                selected_frame_indices_all.append(indices)
                selected_frames_all_0.append(new_image_aux_batch_0[i_batch][indices])
                selected_frames_all_1.append(new_image_aux_batch_1[i_batch][indices])
                selected_frames_feature_all.append(frame_features[indices])
                new_split_sizes.append(len(indices))
                continue
            segments_frames_0 = []
            segments_frames_1 = []
            segments_features = []
            for start_idx in range(0, len(frame_features), window_size):
                end_idx = min(start_idx + window_size, len(frame_features))
                segments_frames_0.append(
                    new_image_aux_batch_0[i_batch][start_idx:end_idx]
                )
                segments_frames_1.append(
                    new_image_aux_batch_1[i_batch][start_idx:end_idx]
                )
                segments_features.append(frame_features[start_idx:end_idx])
            selected_frames_0 = []
            selected_frames_1 = []
            selected_features = []
            selected_frame_indices = []
            for i, segment in enumerate(segments_features):
                query_feature = segment.flatten(1, 2)
                query_feature = query_feature / torch.norm(
                    (query_feature), dim=1, keepdim=True
                )
                similarities = torch.mean(query_feature @ query_feature.T, dim=1)
                similarities[len(segment) // 2] = 0
                indices = torch.where(similarities < threshold)[0]
                selected_frames_0.append(segments_frames_0[i][indices])
                selected_frames_1.append(segments_frames_1[i][indices])
                selected_features.append(segment[indices])
                selected_frame_indices.extend(indices + i * window_size)
            selected_frames_0 = torch.cat(selected_frames_0, dim=0)
            selected_frames_1 = torch.cat(selected_frames_1, dim=0)
            selected_features = torch.cat(selected_features, dim=0)
            selected_frame_indices = torch.tensor(selected_frame_indices)
            # ablation
            max_num_frames = 400  # in case of OOM
            if len(selected_frames_0) > max_num_frames:
                interval = len(selected_frames_0) / float(max_num_frames)
                indices = [int(interval * i) for i in range(max_num_frames)]
                new_split_sizes.append(len(indices))
                selected_frames_all_0.append(selected_frames_0[indices])
                selected_frames_all_1.append(selected_frames_1[indices])
                selected_frames_feature_all.append(selected_features[indices])
                selected_frame_indices = selected_frame_indices[indices]
            else:
                new_split_sizes.append(len(selected_frames_0))
                selected_frames_all_0.append(selected_frames_0)
                selected_frames_all_1.append(selected_frames_1)
                selected_frames_feature_all.append(selected_features)
            selected_frame_indices_all.append(selected_frame_indices)
        selected_frames_all_0 = torch.cat(selected_frames_all_0, dim=0)
        selected_frames_all_1 = torch.cat(selected_frames_all_1, dim=0)
        selected_frames_feature_all = torch.cat(selected_frames_feature_all, dim=0)
        return (
            selected_frames_feature_all,
            new_split_sizes,
            [selected_frames_all_0, selected_frames_all_1],
            selected_frame_indices_all,
        )

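    # prepare_inputs_labels_for_multimodal is the main entry point: it encodes
    # the (possibly frame-selected) images, runs the SVA vision samplers per
    # query group, unpads and flattens the per-image features, splices them
    # into the text embeddings at every IMAGE_TOKEN_INDEX position (labels get
    # IGNORE_INDEX there), and finally truncates and pads the batch while
    # rebuilding attention_mask and position_ids to match.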
    def prepare_inputs_labels_for_multimodal(
        self,
        input_ids,
        position_ids,
        attention_mask,
        past_key_values,
        labels,
        images,
        image_aux_attention_masks_list=None,
        image_sizes=None,
    ):
        # vision_tower = self.get_vision_tower()
        vision_tower_aux_list = self.get_model().get_vision_tower_aux_list()
        if vision_tower_aux_list is None or images is None or input_ids.shape[1] == 1:
            return (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                None,
                labels,
                None,
                None,
                None,
                None,
            )

        image_aux_list = images

        split_sizes = None

        if type(image_aux_list[0]) is list or image_aux_list[0].ndim == 5:
            split_sizes_ori = [
                1 if image.ndim == 3 else image.shape[0] for image in image_aux_list[0]
            ]
            new_image_aux_list = []
            for image_aux in image_aux_list:
                if type(image_aux) is list:
                    image_aux = [
                        x.unsqueeze(0) if x.ndim == 3 else x for x in image_aux
                    ]
                concat_image_aux = torch.cat([image for image in image_aux], dim=0)
                new_image_aux_list.append(concat_image_aux)
            image_aux_features_dino = self.encode_images(
                new_image_aux_list, encode_type="dino"
            )

            (
                image_aux_features_dino,
                split_sizes,
                new_image_aux_list,
                selected_frame_indices_all,
            ) = self.select_frame(
                image_aux_features_dino,
                split_sizes_ori,
                input_ids,
                new_image_aux_list,
                image_sizes,
                threshold=getattr(self.get_model().config, "dino_threshold", 0.83),
            )

            image_aux_features_siglip = self.encode_images(
                new_image_aux_list, encode_type="siglip"
            )
            image_aux_features_list = [
                image_aux_features_siglip,
                image_aux_features_dino,
            ]

            bs = image_aux_features_list[0].shape[0]
            dtype = new_image_aux_list[0].dtype

            frame_sizes = []
            for i in range(len(image_sizes)):
                for j in range(split_sizes[i]):
                    frame_sizes.append(image_sizes[i])
            image_sizes = frame_sizes
        else:
            image_aux_features_list = self.encode_images(image_aux_list)
            bs = image_aux_list[0].shape[0]
            dtype = image_aux_list[0].dtype

        image_token_len = self.get_model().config.image_token_len
        query_num_list = self.get_model().config.query_num_list

        final_height = final_width = int(image_token_len**0.5)

        final_image_features_list = []
        final_image_features_down_list = []

        # only needed for sva
        vision_tower_aux_feature_list_final = None
        vision_tower_aux_attention_masks_list_final = None
        global_context_feature_final = None

        if self.get_model().config.mm_projector_type == "sva":
            vision_tower_aux_feature_list = []
            vision_tower_aux_attention_masks_list = []
            # get vision tokens from each vision tower
            for aux_i in range(len(vision_tower_aux_list)):
                image_aux_features = image_aux_features_list[aux_i]

                image_aux_features = getattr(
                    self.get_model(), "mm_projector_aux_{}".format(aux_i)
                )(image_aux_features).to(dtype)
                if aux_i == 0:
                    global_context_feature = image_aux_features.mean(1).view(
                        bs, 1, 1, -1
                    )

                vision_tower_aux_feature_list.append(image_aux_features)
            input_mix_res = True
            input_high_res = True
            # perform vision sampling for each query group
            for query_group_i, query_num in enumerate(query_num_list):
                query_features_i = (
                    self.get_model()
                    .vision_query[query_group_i, :]
                    .view(1, 1, 1, -1)
                    .expand(bs, query_num, -1, -1)
                )
                global_context_feature_i = global_context_feature.expand(
                    -1, query_num, 1, -1
                ).flatten(0, 1)
                query_side_len = int(query_num**0.5)
                if IS_XLA_AVAILABLE:
                    (
                        vision_tower_aux_feature_list_i,
                        vision_tower_aux_attention_masks_list_i,
                    ) = self.rearrange_vision_tower_features_train(
                        vision_tower_aux_feature_list,
                        image_aux_attention_masks_list,
                        query_side_len,
                    )
                else:
                    (
                        vision_tower_aux_feature_list_i,
                        vision_tower_aux_attention_masks_list_i,
                    ) = self.rearrange_vision_tower_features_inference(
                        vision_tower_aux_feature_list, query_side_len, image_sizes
                    )

                query_features_i = getattr(
                    self.get_model(), "vision_sampler_{}".format(query_group_i)
                )(
                    query_features_i.flatten(0, 1),
                    global_context_feature_i,
                    *vision_tower_aux_feature_list_i,
                    *vision_tower_aux_attention_masks_list_i,
                )
                query_features_i = query_features_i.view(bs, query_num, -1)

                if split_sizes is not None:
                    try:
                        if "llama" in self.get_model().config.model_type:
                            text_len = torch.where(input_ids[0] == 128002)[-1][0]
                        else:
                            text_len = torch.where(input_ids[0] == 151643)[-1][0]
                    except:
                        text_len = len(input_ids[0])
                    max_visual_len = (
                        self.get_model().config.tokenizer_model_max_length
                        - text_len
                        - getattr(self.get_model().config, "inference_max_length", 16)
                    )
                    max_num_frames = max(
                        1,
                        math.floor(max_visual_len // (final_height * final_width)),
                    )
                    max_num_frames_low = max(
                        1,
                        math.floor(
                            max_visual_len
                            // (self.get_model().config.lowres_token ** 2)
                        ),
                    )
                    if split_sizes[0] < max_num_frames:
                        input_mix_res = False
                    elif split_sizes[0] > max_num_frames_low:
                        input_mix_res = False
                        input_high_res = False

                # input_mix_res = False  # ablation

                if (getattr(self.config, "highres", False)) and input_mix_res:
                    _query_features_i = (
                        query_features_i.permute(0, 2, 1)
                        .contiguous()
                        .view(bs, -1, query_side_len, query_side_len)
                    )
                    _query_features_i = F.interpolate(
                        _query_features_i.float(),
                        size=(
                            self.get_model().config.lowres_token,
                            self.get_model().config.lowres_token,
                        ),
                        mode="bilinear",
                        align_corners=False,
                    ).to(dtype=query_features_i.dtype)
                    _query_features_i = (
                        _query_features_i.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
                    )
                    final_image_features_down_list.append(_query_features_i)

                # interpolate to the final target size
                if query_side_len != final_height:
                    query_features_i = (
                        query_features_i.permute(0, 2, 1)
                        .contiguous()
                        .view(bs, -1, query_side_len, query_side_len)
                    )
                    if input_high_res:
                        query_features_i = F.interpolate(
                            query_features_i.float(),
                            size=(final_height, final_width),
                            mode="bilinear",
                            align_corners=False,
                        ).to(dtype=query_features_i.dtype)
                    else:
                        query_features_i = F.interpolate(
                            query_features_i.float(),
                            size=(8, 8),
                            mode="bilinear",
                            align_corners=False,
                        ).to(dtype=query_features_i.dtype)
                    query_features_i = (
                        query_features_i.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
                    )
                final_image_features_list.append(query_features_i)

            if IS_XLA_AVAILABLE:
                (
                    vision_tower_aux_feature_list_final,
                    vision_tower_aux_attention_masks_list_final,
                ) = self.rearrange_vision_tower_features_train(
                    vision_tower_aux_feature_list,
                    image_aux_attention_masks_list,
                    final_height,
                )
                global_context_feature_final = global_context_feature.expand(
                    -1, final_height * final_width, 1, -1
                ).flatten(0, 1)
        else:
            final_image_features_list = image_aux_features_list

        image_features = torch.cat(final_image_features_list, -1)
        image_features = self.get_model().mm_projector(image_features).to(dtype)

        if (getattr(self.config, "highres", False)) and input_mix_res:
            image_features_down = torch.cat(final_image_features_down_list, -1)
            image_features_down = (
                self.get_model().mm_projector(image_features_down).to(dtype)
            )

        if IS_XLA_AVAILABLE:
            image_features = image_features.view(
                image_features.shape[0], final_height, final_width, -1
            )
            image_features = torch.cat(
                (
                    image_features,
                    self.model.image_newline[None, None, None, :].expand(
                        image_features.shape[0], final_height, 1, -1
                    ),
                ),
                dim=2,
            )
            image_features = image_features.flatten(1, 2)
            final_size = [(final_height, final_width)] * bs

        else:
            image_features = image_features.view(bs, final_height, final_width, -1)
            if (getattr(self.config, "highres", False)) and input_mix_res:
                image_features_down = image_features_down.view(
                    bs,
                    self.get_model().config.lowres_token,
                    self.get_model().config.lowres_token,
                    -1,
                )
            image_features_unpadded = []
            image_features_downsample = []
            final_size = []
            if self.get_model().config.mm_projector_type == "sva":
                (
                    vision_tower_aux_feature_list_final,
                    vision_tower_aux_attention_masks_list_final,
                ) = self.rearrange_vision_tower_features_inference(
                    vision_tower_aux_feature_list, final_height, image_sizes, unpad=True
                )
                global_context_feature_final = []
            for batch_i in range(bs):
                cur_image_feature = image_features[batch_i]
                image_size = image_sizes[batch_i]

                cur_image_feature = unpad_image(
                    cur_image_feature.unsqueeze(0), image_size
                )

                cur_h, cur_w = cur_image_feature.shape[1:3]
                try:  # fix bug for some invalid image
                    cur_image_feature = cur_image_feature.view(1, cur_h, cur_w, -1)
                    final_size.append((cur_h, cur_w))
                except:
                    # print(f"invalid after unpad {image_features[batch_i].shape}, {image_sizes[batch_i]}", flush=True)
                    cur_image_feature = image_features[batch_i].unsqueeze(0)
                    image_size = image_sizes[batch_i]
                    cur_h, cur_w = cur_image_feature.shape[1:3]
                    cur_image_feature = cur_image_feature.view(1, cur_h, cur_w, -1)
                    final_size.append((cur_h, cur_w))

                if (getattr(self.config, "highres", False)) and input_mix_res:
                    cur_image_feature_down = unpad_image(
                        image_features_down[batch_i].unsqueeze(0),
                        (
                            int(
                                image_size[0]
                                / (
                                    image_token_len**0.5
                                    / self.get_model().config.lowres_token
                                )
                            ),
                            int(
                                image_size[1]
                                / (
                                    image_token_len**0.5
                                    / self.get_model().config.lowres_token
                                )
                            ),
                        ),
                    )
                    _cur_h, _cur_w = cur_image_feature_down.shape[1:3]

                    try:  # fix bug for some invalid image
                        cur_image_feature_down = cur_image_feature_down.view(
                            1, _cur_h, _cur_w, -1
                        )
                    except:
                        print("invalid after unpad", flush=True)
                        cur_image_feature_down = image_features_down[batch_i].unsqueeze(
                            0
                        )
                        _cur_h, _cur_w = cur_image_feature_down.shape[1:3]
                        cur_image_feature_down = cur_image_feature_down.view(
                            1, _cur_h, _cur_w, -1
                        )

                    cur_image_feature_down = torch.cat(
                        (
                            cur_image_feature_down,
                            self.model.image_newline.view(1, 1, 1, -1)
                            .expand(1, _cur_h, 1, -1)
                            .to(cur_image_feature_down.device),
                        ),
                        dim=2,
                    ).flatten(1, 2)

                    if split_sizes is None and getattr(self.config, "frame_pos", False):
                        frame_pos = (
                            self.get_model()
                            .get_frame_pos(torch.arange(1))
                            .to(cur_image_feature_down.device)
                            .to(cur_image_feature_down.dtype)
                        )
                        cur_image_feature_down += frame_pos

                    image_features_downsample.append(cur_image_feature_down.squeeze(0))

                cur_image_feature = torch.cat(
                    (
                        cur_image_feature,
                        self.model.image_newline.view(1, 1, 1, -1)
                        .expand(1, cur_h, 1, -1)
                        .to(cur_image_feature.device),
                    ),
                    dim=2,
                )

                if split_sizes is None and getattr(self.config, "frame_pos", False):
                    frame_pos = (
                        self.get_model()
                        .get_frame_pos(torch.arange(1))
                        .to(cur_image_feature.device)
                        .to(cur_image_feature.dtype)
                    )
                    cur_image_feature += frame_pos

                cur_image_feature = cur_image_feature.flatten(1, 2)
                image_features_unpadded.append(cur_image_feature.squeeze(0))

                if self.get_model().config.mm_projector_type == "sva":
                    cur_global_context_feature = global_context_feature[batch_i].expand(
                        cur_h * cur_w, 1, -1
                    )
                    global_context_feature_final.append(cur_global_context_feature)
            if self.get_model().config.mm_projector_type == "sva":
                global_context_feature_final = torch.cat(
                    global_context_feature_final, 0
                )

            if (getattr(self.config, "highres", False)) and input_mix_res:
                image_features = image_features_downsample
            else:
                image_features = image_features_unpadded

        # TODO: image start / end is not implemented here to support pretraining.
        if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
            self.config, "mm_use_im_start_end", False
        ):
            raise NotImplementedError

        split_image_features_unpadded = None
        frame_split_sizes = None

        if split_sizes is not None:
            split_image_features = []
            split_image_features_unpadded = (
                []
                if (getattr(self.config, "highres", False)) and input_mix_res
                else None
            )
            start_idx = 0
            for split_batch_idx, split_size in enumerate(split_sizes):
                if isinstance(image_features[start_idx : start_idx + split_size], list):
                    if getattr(self.config, "frame_pos", False):
                        frame_feature = torch.cat(
                            image_features[start_idx : start_idx + split_size], dim=0
                        ).reshape(split_size, -1, image_features[0].shape[-1])
                        frame_pos = (
                            self.get_model()
                            .get_frame_pos(selected_frame_indices_all[split_batch_idx])
                            .to(frame_feature.device)
                            .to(frame_feature.dtype)
                        )
                        frame_feature += frame_pos
                        split_image_features.append(
                            frame_feature.reshape(-1, image_features[0].shape[-1])
                        )
                    else:
                        split_image_features.append(
                            torch.cat(
                                image_features[start_idx : start_idx + split_size],
                                dim=0,
                            )
                        )
                    if (getattr(self.config, "highres", False)) and input_mix_res:
                        if getattr(self.config, "frame_pos", False):
                            frame_feature = torch.cat(
                                image_features_unpadded[
                                    start_idx : start_idx + split_size
                                ],
                                dim=0,
                            ).reshape(split_size, -1, image_features[0].shape[-1])
                            frame_pos = (
                                self.get_model()
                                .get_frame_pos(
                                    selected_frame_indices_all[split_batch_idx]
                                )
                                .to(frame_feature.device)
                                .to(frame_feature.dtype)
                            )
                            frame_feature += frame_pos
                            split_image_features_unpadded.append(
                                frame_feature.reshape(-1, image_features[0].shape[-1])
                            )
                        else:
                            split_image_features_unpadded.append(
                                torch.cat(
                                    image_features_unpadded[
                                        start_idx : start_idx + split_size
                                    ],
                                    dim=0,
                                )
                            )
                else:
                    if getattr(self.config, "frame_pos", False):
                        frame_feature = image_features[
                            start_idx : start_idx + split_size
                        ].reshape(split_size, -1, image_features[0].shape[-1])
                        frame_pos = (
                            self.get_model()
                            .get_frame_pos(selected_frame_indices_all[split_batch_idx])
                            .to(frame_feature.device)
                            .to(frame_feature.dtype)
                        )
                        frame_feature += frame_pos
                        split_image_features.append(
                            frame_feature.reshape(-1, image_features[0].shape[-1])
                        )
                    else:
                        split_image_features.append(
                            image_features[start_idx : start_idx + split_size]
                        )
                    if (getattr(self.config, "highres", False)) and input_mix_res:
                        if getattr(self.config, "frame_pos", False):
                            frame_feature = image_features_unpadded[
                                start_idx : start_idx + split_size
                            ]
                            frame_pos = (
                                self.get_model()
                                .get_frame_pos(
                                    selected_frame_indices_all[split_batch_idx]
                                )
                                .to(frame_feature.device)
                                .to(frame_feature.dtype)
                            )
                            frame_feature += frame_pos
                            split_image_features_unpadded.append(
                                frame_feature.reshape(-1, image_features[0].shape[-1])
                            )
                        else:
                            split_image_features_unpadded.append(
                                image_features_unpadded[
                                    start_idx : start_idx + split_size
                                ]
                            )
                start_idx += split_size
            image_features = split_image_features
            frame_split_sizes = split_sizes

        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(
                0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
            )
        if labels is None:
            labels = torch.full_like(input_ids, IGNORE_INDEX)

        # remove the padding using attention_mask -- FIXME
        _input_ids = input_ids

        attention_mask = attention_mask | (input_ids == IMAGE_TOKEN_INDEX)

        input_ids = [
            cur_input_ids[cur_attention_mask]
            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
        ]
        labels = [
            cur_labels[cur_attention_mask]
            for cur_labels, cur_attention_mask in zip(labels, attention_mask)
        ]

        new_input_embeds = []
        new_labels = []
        image_token_indices_batch = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
                cur_input_embeds = torch.cat(
                    [cur_input_embeds_1, cur_image_features[0:0]], dim=0
                )
                new_input_embeds.append(cur_input_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            image_token_indices = (
                [-1]
                + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
                + [cur_input_ids.shape[0]]
            )
            image_token_indices_batch.append(
                torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()[0]
            )
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(
                    cur_input_ids[
                        image_token_indices[i] + 1 : image_token_indices[i + 1]
                    ]
                )
                cur_labels_noim.append(
                    cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]
                )
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_input_embeds = self.get_model().embed_tokens(
                torch.cat(cur_input_ids_noim)
            )
            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
            cur_new_input_embeds = []
            cur_new_labels = []

            text_len = sum([x.shape[0] for x in cur_input_embeds_no_im])
            visual_len = len(image_features[cur_image_idx])
            max_visual_len = (
                self.get_model().config.tokenizer_model_max_length
                - getattr(self.get_model().config, "inference_max_length", 16)
                - text_len
            )
            mix_token = False

            # ablation mix
            if (
                input_mix_res
                and (
                    self.get_model().config.image_token_len
                    > getattr(self.get_model().config, "lowres_token", 8) ** 2
                )
                and frame_split_sizes is not None
                and getattr(self.config, "highres", False)
            ):
                if max_visual_len > visual_len:
                    visual_emb = image_features[cur_image_idx]
                    text_emb = cur_input_embeds_no_im[-1]
                    highres_num = math.floor(
                        (max_visual_len - visual_len)
                        / (
                            split_image_features_unpadded[cur_image_idx].shape[0]
                            // frame_split_sizes[cur_image_idx]
                            - visual_emb.shape[0] // frame_split_sizes[cur_image_idx]
                        )
                    )
                    if highres_num >= 1:
                        mix_token = True
                        sim = torch.matmul(visual_emb, text_emb.transpose(0, 1)).mean(
                            dim=-1
                        )
                        sim_frame = sim.reshape(
                            frame_split_sizes[cur_image_idx], -1
                        ).mean(dim=-1)
                        highres_num = min(highres_num, sim_frame.shape[0])
                        top_values, top_indices = torch.topk(sim_frame, highres_num)
                        if len(top_indices) > 0:
                            sorted_indices = torch.sort(top_indices)[1]
                            top_indices = top_indices[sorted_indices]
                            visual_emb_frame = image_features[cur_image_idx].reshape(
                                frame_split_sizes[cur_image_idx],
                                -1,
                                image_features[cur_image_idx].shape[-1],
                            )
                            visual_emb_frame_highres = split_image_features_unpadded[
                                cur_image_idx
                            ].reshape(
                                frame_split_sizes[cur_image_idx],
                                -1,
                                split_image_features_unpadded[cur_image_idx].shape[-1],
                            )
                            current_point = 0
                            mix_visual_emb_frame = []
                            for frame_i in range(len(visual_emb_frame)):
                                if current_point > len(top_indices) - 1:
                                    mix_visual_emb_frame.append(
                                        visual_emb_frame[frame_i]
                                    )
                                    continue
                                if frame_i == top_indices[current_point]:
                                    mix_visual_emb_frame.append(
                                        visual_emb_frame_highres[frame_i]
                                    )
                                    current_point += 1
                                else:
                                    mix_visual_emb_frame.append(
                                        visual_emb_frame[frame_i]
                                    )
                            image_features[cur_image_idx] = torch.cat(
                                mix_visual_emb_frame, dim=0
                            )
            # ablation drop

            if (
                max_visual_len < visual_len
                and frame_split_sizes is not None
                and not mix_token
            ):
                visual_emb_frame = image_features[cur_image_idx].reshape(
                    frame_split_sizes[cur_image_idx],
                    -1,
                    image_features[cur_image_idx].shape[-1],
                )

                sim = F.cosine_similarity(
                    visual_emb_frame[:-1],
                    visual_emb_frame[1:],
                    dim=-1,
                )

                new_visual_emb_frames = []
                for start_idx in range(0, len(visual_emb_frame), 8):
                    end_idx = min(start_idx + 8, len(visual_emb_frame))
                    chunk_feature = visual_emb_frame[start_idx:end_idx]  # 8, HW, C
                    if len(chunk_feature) == 1:
                        new_visual_emb_frames.append(chunk_feature[0])
                        continue
                    sim = F.cosine_similarity(
                        chunk_feature[0]
                        .unsqueeze(0)
                        .repeat_interleave(len(chunk_feature[1:]), dim=0),
                        chunk_feature[1:],
                        dim=-1,
                    )
                    new_visual_emb_frame = torch.cat(
                        [
                            chunk_feature[0],
                            chunk_feature[1:].flatten(0, 1)[
                                sim.flatten(0, 1)
                                < getattr(
                                    self.get_model().config, "drop_threshold", 0.7
                                )
                            ],
                        ],
                        dim=0,
                    )
                    new_visual_emb_frames.append(new_visual_emb_frame)

                reduced_visual_len = sum([x.shape[0] for x in new_visual_emb_frames])

                if reduced_visual_len > max_visual_len:
                    force_remove = math.ceil(
                        (reduced_visual_len - max_visual_len)
                        / len(new_visual_emb_frames)
                    )
                    for chunk_i in range(len(new_visual_emb_frames)):
                        new_visual_emb_frames[chunk_i] = new_visual_emb_frames[chunk_i][
                            :-force_remove
                        ]
                    new_visual_emb_frames = torch.cat(new_visual_emb_frames, dim=0)
                else:
                    new_visual_emb_frames = torch.cat(new_visual_emb_frames, dim=0)

                image_features[cur_image_idx] = new_visual_emb_frames[:max_visual_len]

            for i in range(num_images + 1):
                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    cur_image_features = image_features[cur_image_idx]
                    cur_image_idx += 1
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_labels.append(
                        torch.full(
                            (cur_image_features.shape[0],),
                            IGNORE_INDEX,
                            device=cur_labels.device,
                            dtype=cur_labels.dtype,
                        )
                    )

            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]

            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_input_embeds.append(cur_new_input_embeds)
            new_labels.append(cur_new_labels)

        # Truncate sequences to max length as image embeddings can make the sequence longer
        tokenizer_model_max_length = getattr(
            self.config, "tokenizer_model_max_length", None
        )
        if tokenizer_model_max_length is not None:
            new_input_embeds = [
                x[:tokenizer_model_max_length] for x in new_input_embeds
            ]
            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

        # Combine them
        max_len = max(x.shape[0] for x in new_input_embeds)
        batch_size = len(new_input_embeds)

        new_input_embeds_padded = []
        new_labels_padded = torch.full(
            (batch_size, max_len),
            IGNORE_INDEX,
            dtype=new_labels[0].dtype,
            device=new_labels[0].device,
        )
        attention_mask = torch.zeros(
            (batch_size, max_len),
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )
        position_ids = torch.zeros(
            (batch_size, max_len),
            dtype=position_ids.dtype,
            device=position_ids.device,
        )

        for i, (cur_new_embed, cur_new_labels) in enumerate(
            zip(new_input_embeds, new_labels)
        ):
            cur_len = cur_new_embed.shape[0]
            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
                new_input_embeds_padded.append(
                    torch.cat(
                        (
                            torch.zeros(
                                (max_len - cur_len, cur_new_embed.shape[1]),
                                dtype=cur_new_embed.dtype,
                                device=cur_new_embed.device,
                            ),
                            cur_new_embed,
                        ),
                        dim=0,
                    )
                )
                if cur_len > 0:
                    new_labels_padded[i, -cur_len:] = cur_new_labels
                    attention_mask[i, -cur_len:] = True
                    position_ids[i, -cur_len:] = torch.arange(
                        0,
                        cur_len,
                        dtype=position_ids.dtype,
                        device=position_ids.device,
            else:
                new_input_embeds_padded.append(
                    torch.cat(
                        (
                            cur_new_embed,
                            torch.zeros(
                                (max_len - cur_len, cur_new_embed.shape[1]),
                                dtype=cur_new_embed.dtype,
                                device=cur_new_embed.device,
                            ),
                        ),
                        dim=0,
                    )
                )
                if cur_len > 0:
                    new_labels_padded[i, :cur_len] = cur_new_labels
                    attention_mask[i, :cur_len] = True
                    position_ids[i, :cur_len] = torch.arange(
                        0,
                        cur_len,
                        dtype=position_ids.dtype,
                        device=position_ids.device,
1630
+ )
1631
+
1632
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
1633
+
1634
+ if _labels is None:
1635
+ new_labels = None
1636
+ else:
1637
+ new_labels = new_labels_padded
1638
+
1639
+ if _attention_mask is None:
1640
+ attention_mask = None
1641
+ else:
1642
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
1643
+
1644
+ if _position_ids is None:
1645
+ position_ids = None
1646
+
1647
+ return (
1648
+ None,
1649
+ position_ids,
1650
+ attention_mask,
1651
+ past_key_values,
1652
+ new_input_embeds,
1653
+ new_labels,
1654
+ vision_tower_aux_feature_list_final,
1655
+ vision_tower_aux_attention_masks_list_final,
1656
+ final_size,
1657
+ global_context_feature_final,
1658
+ )
1659
+
1660
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
1661
+ if model_args.mm_use_im_patch_token:
1662
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
1663
+ self.resize_token_embeddings(len(tokenizer))
1664
+
1665
+ if model_args.mm_use_im_start_end:
1666
+ num_new_tokens = tokenizer.add_tokens(
1667
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
1668
+ )
1669
+ self.resize_token_embeddings(len(tokenizer))
1670
+
1671
+ if num_new_tokens > 0:
1672
+ input_embeddings = self.get_input_embeddings().weight.data
1673
+ output_embeddings = self.get_output_embeddings().weight.data
1674
+
1675
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
1676
+ dim=0, keepdim=True
1677
+ )
1678
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
1679
+ dim=0, keepdim=True
1680
+ )
1681
+
1682
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
1683
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
1684
+
1685
+ if model_args.tune_mm_mlp_adapter:
1686
+ for p in self.get_input_embeddings().parameters():
1687
+ p.requires_grad = True
1688
+ for p in self.get_output_embeddings().parameters():
1689
+ p.requires_grad = False
1690
+
1691
+ if model_args.pretrain_mm_mlp_adapter:
1692
+ mm_projector_weights = torch.load(
1693
+ model_args.pretrain_mm_mlp_adapter, map_location="cpu"
1694
+ )
1695
+ embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
1696
+ assert num_new_tokens == 2
1697
+ if input_embeddings.shape == embed_tokens_weight.shape:
1698
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[
1699
+ -num_new_tokens:
1700
+ ]
1701
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
1702
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
1703
+ else:
1704
+ raise ValueError(
1705
+ f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}."
1706
+ )
1707
+ elif model_args.mm_use_im_patch_token:
1708
+ if model_args.tune_mm_mlp_adapter:
1709
+ for p in self.get_input_embeddings().parameters():
1710
+ p.requires_grad = False
1711
+ for p in self.get_output_embeddings().parameters():
1712
+ p.requires_grad = False
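For reference, a minimal standalone sketch of the cosine-similarity token reduction implemented above (the `drop_threshold` branch of `prepare_inputs_labels_for_multimodal`): within each chunk of up to 8 frames, tokens of later frames that are too similar to the first frame of the chunk are dropped. The shapes, chunk size, and threshold below are illustrative assumptions, not values read from the checkpoint.

import torch
import torch.nn.functional as F

def reduce_frame_tokens(frame_tokens: torch.Tensor, drop_threshold: float = 0.8, chunk: int = 8) -> torch.Tensor:
    # frame_tokens: (num_frames, tokens_per_frame, dim)
    kept = []
    for start in range(0, frame_tokens.shape[0], chunk):
        block = frame_tokens[start:start + chunk]
        if block.shape[0] == 1:
            kept.append(block[0])
            continue
        # similarity of every later frame's tokens to the first frame's tokens
        sim = F.cosine_similarity(
            block[0].unsqueeze(0).expand(block.shape[0] - 1, -1, -1),
            block[1:],
            dim=-1,
        )
        keep = sim.flatten(0, 1) < drop_threshold  # keep only sufficiently novel tokens
        kept.append(torch.cat([block[0], block[1:].flatten(0, 1)[keep]], dim=0))
    return torch.cat(kept, dim=0)

# toy usage: 16 frames of 576 tokens each
print(reduce_frame_tokens(torch.randn(16, 576, 64)).shape)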
config.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "_name_or_path": "jadechoghari/LongVU_Llama3_2_3B_img",
3
+ "architectures": [
4
+ "CambrianLlamaForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "modeling.CambrianConfig",
8
+ "AutoModel": "modeling.CambrianLlamaForCausalLM",
9
+ "AutoModelForCausalLM": "modeling.CambrianLlamaForCausalLM"
10
+ },
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 128000,
14
+ "connect_layer": 2,
15
+ "connector_depth": 3,
16
+ "connector_only": true,
17
+ "dino_threshold": 0.83,
18
+ "drop_threshold": 0.8,
19
+ "eos_token_id": [
20
+ 128001,
21
+ 128008,
22
+ 128009
23
+ ],
24
+ "frame_pos": false,
25
+ "freeze_mm_mlp_adapter": false,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 3072,
29
+ "highres": false,
30
+ "highres_connect": false,
31
+ "image_aspect_ratio": "pad",
32
+ "image_position": 91,
33
+ "image_token_len": 576,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 8192,
36
+ "is_image_newline": true,
37
+ "is_st_sampler": false,
38
+ "lowres_token": 8,
39
+ "max_position_embeddings": 131072,
40
+ "mlp_bias": false,
41
+ "mm_patch_merge_type": "flat",
42
+ "mm_projector_lr": null,
43
+ "mm_projector_type": "sva",
44
+ "mm_use_im_patch_token": false,
45
+ "mm_use_im_start_end": false,
46
+ "mm_vision_sampler_lr": null,
47
+ "mm_vision_select_feature": "patch",
48
+ "mm_vision_select_layer": -2,
49
+ "mm_vision_tower_aux_list": [
50
+ "siglip/CLIP-ViT-SO400M-14-384",
51
+ "facebook/dinov2-giant-res378"
52
+ ],
53
+ "mm_vision_tower_aux_token_len_list": [
54
+ 576,
55
+ 576
56
+ ],
57
+ "mm_vision_tower_lr": null,
58
+ "model_type": "cambrian_llama",
59
+ "num_attention_heads": 24,
60
+ "num_hidden_layers": 28,
61
+ "num_key_value_heads": 8,
62
+ "num_of_vision_sampler_layers": 10,
63
+ "num_query_group": 1,
64
+ "pretraining_tp": 1,
65
+ "query_num_list": [
66
+ 576
67
+ ],
68
+ "rms_norm_eps": 1e-05,
69
+ "rope_scaling": {
70
+ "factor": 32.0,
71
+ "high_freq_factor": 4.0,
72
+ "low_freq_factor": 1.0,
73
+ "original_max_position_embeddings": 8192,
74
+ "rope_type": "llama3"
75
+ },
76
+ "rope_theta": 500000.0,
77
+ "spmd_debug": null,
78
+ "spmd_fsdp_sharding": null,
79
+ "spmd_mesh": null,
80
+ "start_of_vision_sampler_layers": 0,
81
+ "stride_of_vision_sampler_layers": 3,
82
+ "tie_word_embeddings": false,
83
+ "tokenizer_model_max_length": 8192,
84
+ "tokenizer_padding_side": "right",
85
+ "torch_dtype": "float32",
86
+ "transformers_version": "4.44.2",
87
+ "tune_mm_mlp_adapter": false,
88
+ "unfreeze_mm_vision_tower": false,
89
+ "use_cache": false,
90
+ "use_mm_proj": true,
91
+ "vision_hidden_size": 1024,
92
+ "vision_tower_aux_token_len_list": [
93
+ 576,
94
+ 576
95
+ ],
96
+ "vocab_size": 128256
97
+ }
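The `auto_map` entries above point at the custom classes defined in `modeling.py`, so the config loads through the Auto classes only with remote code enabled. A minimal sketch, assuming the repo id from `_name_or_path`, network access, and that the repo's custom modules resolve:

from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "jadechoghari/LongVU_Llama3_2_3B_img", trust_remote_code=True
)
print(config.model_type)  # cambrian_llama
print(config.drop_threshold, config.mm_vision_tower_aux_list)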
modeling.py ADDED
@@ -0,0 +1,546 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import (
24
+ AutoConfig,
25
+ AutoModelForCausalLM,
26
+ LlamaConfig,
27
+ LlamaForCausalLM,
28
+ LlamaModel,
29
+ )
30
+ from transformers.cache_utils import Cache, DynamicCache
31
+ from transformers.generation.utils import GenerateOutput
32
+
33
+ from transformers.modeling_attn_mask_utils import (
34
+ _prepare_4d_causal_attention_mask,
35
+ _prepare_4d_causal_attention_mask_for_sdpa,
36
+ )
37
+
38
+ from transformers.modeling_outputs import (
39
+ BaseModelOutputWithPast,
40
+ CausalLMOutputWithPast,
41
+ )
42
+ from transformers.utils import logging
43
+
44
+ from cambrian_arch import CambrianMetaForCausalLM, CambrianMetaModel
45
+
46
+ IS_XLA_AVAILABLE = False
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+
51
+ class CambrianConfig(LlamaConfig):
52
+ model_type = "cambrian_llama"
53
+
54
+ debug = "debug"
55
+
56
+
57
+ class CambrianLlamaModel(CambrianMetaModel, LlamaModel):
58
+ config_class = CambrianConfig
59
+
60
+ def __init__(self, config: LlamaConfig):
61
+ super(CambrianLlamaModel, self).__init__(config)
62
+
63
+ def forward(
64
+ self,
65
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
66
+ input_ids: torch.LongTensor = None,
67
+ attention_mask: Optional[torch.Tensor] = None,
68
+ position_ids: Optional[torch.LongTensor] = None,
69
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
70
+ inputs_embeds: Optional[torch.FloatTensor] = None,
71
+ use_cache: Optional[bool] = None,
72
+ output_attentions: Optional[bool] = None,
73
+ output_hidden_states: Optional[bool] = None,
74
+ return_dict: Optional[bool] = None,
75
+ vision_tower_aux_feature_list: Optional[List[torch.FloatTensor]] = None,
76
+ vision_tower_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
77
+ final_vision_feature_size: Optional[List[tuple]] = None,
78
+ global_context_feature: Optional[torch.Tensor] = None,
79
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
80
+
81
+ output_attentions = (
82
+ output_attentions
83
+ if output_attentions is not None
84
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `config`.
85
+ else self.config.output_attentions
86
+ )
87
+
88
+ output_hidden_states = (
89
+ output_hidden_states
90
+ if output_hidden_states is not None
91
+ else self.config.output_hidden_states
92
+ )
93
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
94
+
95
+ return_dict = (
96
+ return_dict if return_dict is not None else self.config.use_return_dict
97
+ )
98
+
99
+ # retrieve input_ids and inputs_embeds
100
+ if input_ids is not None and inputs_embeds is not None:
101
+ raise ValueError(
102
+ "You cannot specify both input_ids and inputs_embeds at the same time"
103
+ )
104
+ elif input_ids is not None:
105
+ batch_size, seq_length = input_ids.shape[:2]
106
+ elif inputs_embeds is not None:
107
+ batch_size, seq_length = inputs_embeds.shape[:2]
108
+ else:
109
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
110
+
111
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
112
+ # `gradient_checkpointing`.
113
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `training`.
114
+ if self.gradient_checkpointing and self.training:
115
+ if use_cache:
116
+ logger.warning_once(
117
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
118
+ )
119
+ use_cache = False
120
+
121
+ past_key_values_length = 0
122
+ if use_cache:
123
+ use_legacy_cache = not isinstance(past_key_values, Cache)
124
+ if use_legacy_cache:
125
+ # pyre-fixme[9]: past_key_values has type
126
+ # `Optional[List[FloatTensor]]`; used as `DynamicCache`.
127
+ # pyre-fixme[6]: For 1st argument expected
128
+ # `Optional[Tuple[Tuple[FloatTensor]]]` but got
129
+ # `Optional[List[FloatTensor]]`.
130
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
131
+ # pyre-fixme[16]: `Optional` has no attribute `get_usable_length`.
132
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
133
+
134
+ if position_ids is None:
135
+ # pyre-fixme[16]: `Optional` has no attribute `device`.
136
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
137
+ position_ids = torch.arange(
138
+ past_key_values_length,
139
+ seq_length + past_key_values_length,
140
+ dtype=torch.long,
141
+ device=device,
142
+ )
143
+ position_ids = position_ids.unsqueeze(0)
144
+
145
+ if inputs_embeds is None:
146
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `embed_tokens`.
147
+ inputs_embeds = self.embed_tokens(input_ids)
148
+
149
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
150
+ # `_use_flash_attention_2`.
151
+ self._use_flash_attention_2 = getattr(self, "_use_flash_attention_2", False)
152
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `_use_sdpa`.
153
+ self._use_sdpa = getattr(self, "_use_sdpa", True)
154
+ if self._use_flash_attention_2:
155
+ # 2d mask is passed through the layers
156
+ attention_mask = (
157
+ attention_mask
158
+ if (attention_mask is not None and 0 in attention_mask)
159
+ else None
160
+ )
161
+ elif self._use_sdpa and not output_attentions:
162
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
163
+ # the manual implementation that requires a 4D causal mask in all cases.
164
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
165
+ attention_mask,
166
+ (batch_size, seq_length),
167
+ inputs_embeds,
168
+ past_key_values_length,
169
+ )
170
+ else:
171
+ # 4d mask is passed through the layers
172
+ attention_mask = _prepare_4d_causal_attention_mask(
173
+ attention_mask,
174
+ (batch_size, seq_length),
175
+ inputs_embeds,
176
+ past_key_values_length,
177
+ )
178
+
179
+ # embed positions
180
+ hidden_states = inputs_embeds
181
+ # decoder layers
182
+ all_hidden_states = () if output_hidden_states else None
183
+ all_self_attns = () if output_attentions else None
184
+ next_decoder_cache = None
185
+
186
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `layers`.
187
+ for i, decoder_layer in enumerate(self.layers):
188
+ if output_hidden_states:
189
+ all_hidden_states += (hidden_states,)
190
+
191
+ if self.gradient_checkpointing and self.training:
192
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
193
+ # `_gradient_checkpointing_func`.
194
+ layer_outputs = self._gradient_checkpointing_func(
195
+ decoder_layer.__call__,
196
+ hidden_states,
197
+ attention_mask,
198
+ position_ids,
199
+ past_key_values,
200
+ output_attentions,
201
+ use_cache,
202
+ )
203
+ else:
204
+ layer_outputs = decoder_layer(
205
+ hidden_states,
206
+ attention_mask=attention_mask,
207
+ position_ids=position_ids,
208
+ past_key_value=past_key_values,
209
+ output_attentions=output_attentions,
210
+ use_cache=use_cache,
211
+ )
212
+
213
+ hidden_states = layer_outputs[0]
214
+
215
+ if use_cache:
216
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
217
+
218
+ if output_attentions:
219
+ all_self_attns += (layer_outputs[1],)
220
+
221
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `norm`.
222
+ hidden_states = self.norm(hidden_states)
223
+
224
+ # add hidden states from the last decoder layer
225
+ if output_hidden_states:
226
+ all_hidden_states += (hidden_states,)
227
+
228
+ next_cache = None
229
+ if use_cache:
230
+ next_cache = (
231
+ next_decoder_cache.to_legacy_cache()
232
+ # pyre-fixme[61]: `use_legacy_cache` is undefined, or not always
233
+ # defined.
234
+ if use_legacy_cache
235
+ else next_decoder_cache
236
+ )
237
+ if not return_dict:
238
+ return tuple(
239
+ v
240
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
241
+ if v is not None
242
+ )
243
+ return BaseModelOutputWithPast(
244
+ last_hidden_state=hidden_states,
245
+ past_key_values=next_cache,
246
+ hidden_states=all_hidden_states,
247
+ attentions=all_self_attns,
248
+ )
249
+
250
+
251
+ class CambrianLlamaForCausalLM(LlamaForCausalLM, CambrianMetaForCausalLM):
252
+ config_class = CambrianConfig
253
+
254
+ def __init__(self, config):
255
+ super(LlamaForCausalLM, self).__init__(config)
256
+
257
+ self.model = CambrianLlamaModel(config)
258
+ self.pretraining_tp = config.pretraining_tp
259
+ self.vocab_size = config.vocab_size
260
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
261
+
262
+ # Initialize weights and apply final processing
263
+ self.post_init()
264
+
265
+ def get_model(self):
266
+ return self.model
267
+
268
+ def forward(
269
+ self,
270
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
271
+ input_ids: torch.LongTensor = None,
272
+ attention_mask: Optional[torch.Tensor] = None,
273
+ position_ids: Optional[torch.LongTensor] = None,
274
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
275
+ inputs_embeds: Optional[torch.FloatTensor] = None,
276
+ labels: Optional[torch.LongTensor] = None,
277
+ use_cache: Optional[bool] = None,
278
+ output_attentions: Optional[bool] = None,
279
+ output_hidden_states: Optional[bool] = None,
280
+ images: Optional[torch.FloatTensor] = None,
281
+ image_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
282
+ image_sizes: Optional[List[List[int]]] = None,
283
+ return_dict: Optional[bool] = None,
284
+ cache_position=None,
285
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
286
+
287
+ final_vision_feature_size = None
288
+
289
+ if inputs_embeds is None:
290
+ (
291
+ input_ids,
292
+ position_ids,
293
+ attention_mask,
294
+ past_key_values,
295
+ inputs_embeds,
296
+ labels,
297
+ vision_tower_aux_feature_list,
298
+ vision_tower_aux_attention_masks_list,
299
+ final_vision_feature_size,
300
+ global_context_feature,
301
+ ) = self.prepare_inputs_labels_for_multimodal(
302
+ input_ids,
303
+ position_ids,
304
+ attention_mask,
305
+ past_key_values,
306
+ labels,
307
+ images,
308
+ image_aux_attention_masks_list,
309
+ image_sizes,
310
+ )
311
+ if IS_XLA_AVAILABLE:
312
+ # Very Important for TorchXLA
313
+ # self.model.gradient_checkpointing = False
314
+
315
+ # pyre-fixme[21]: Could not find module `torch_xla.utils.checkpoint`.
316
+ from torch_xla.utils.checkpoint import checkpoint
317
+
318
+ # self.model.gradient_checkpointing = True
319
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
320
+ # `_gradient_checkpointing_func`.
321
+ self.model._gradient_checkpointing_func = checkpoint
322
+
323
+ output_attentions = (
324
+ output_attentions
325
+ if output_attentions is not None
326
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute `config`.
327
+ else self.config.output_attentions
328
+ )
329
+ output_hidden_states = (
330
+ output_hidden_states
331
+ if output_hidden_states is not None
332
+ else self.config.output_hidden_states
333
+ )
334
+ return_dict = (
335
+ return_dict if return_dict is not None else self.config.use_return_dict
336
+ )
337
+
338
+ # training
339
+ if IS_XLA_AVAILABLE:
340
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
341
+ # pyre-fixme[29]: `CambrianLlamaModel` is not a function.
342
+ outputs = self.model(
343
+ input_ids=input_ids,
344
+ attention_mask=attention_mask,
345
+ position_ids=position_ids,
346
+ past_key_values=past_key_values,
347
+ inputs_embeds=inputs_embeds,
348
+ use_cache=use_cache,
349
+ output_attentions=output_attentions,
350
+ output_hidden_states=output_hidden_states,
351
+ return_dict=return_dict,
352
+ # pyre-fixme[61]: `vision_tower_aux_feature_list` is undefined, or
353
+ # not always defined.
354
+ vision_tower_aux_feature_list=vision_tower_aux_feature_list,
355
+ # pyre-fixme[61]: `vision_tower_aux_attention_masks_list` is
356
+ # undefined, or not always defined.
357
+ vision_tower_aux_attention_masks_list=vision_tower_aux_attention_masks_list,
358
+ final_vision_feature_size=final_vision_feature_size,
359
+ # pyre-fixme[61]: `global_context_feature` is undefined, or not
360
+ # always defined.
361
+ global_context_feature=global_context_feature,
362
+ )
363
+
364
+ # inference
365
+ else:
366
+ if hasattr(self, "vision_tower_aux_feature_list"):
367
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
368
+ # pyre-fixme[29]: `CambrianLlamaModel` is not a function.
369
+ outputs = self.model(
370
+ input_ids=input_ids,
371
+ attention_mask=attention_mask,
372
+ position_ids=position_ids,
373
+ past_key_values=past_key_values,
374
+ inputs_embeds=inputs_embeds,
375
+ use_cache=use_cache,
376
+ output_attentions=output_attentions,
377
+ output_hidden_states=output_hidden_states,
378
+ return_dict=return_dict,
379
+ vision_tower_aux_feature_list=(
380
+ # pyre-fixme[61]: `vision_tower_aux_feature_list` is
381
+ # undefined, or not always defined.
382
+ vision_tower_aux_feature_list
383
+ if inputs_embeds is None
384
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
385
+ # attribute `vision_tower_aux_feature_list`.
386
+ else self.vision_tower_aux_feature_list
387
+ ),
388
+ vision_tower_aux_attention_masks_list=(
389
+ # pyre-fixme[61]: `vision_tower_aux_attention_masks_list` is
390
+ # undefined, or not always defined.
391
+ vision_tower_aux_attention_masks_list
392
+ if inputs_embeds is None
393
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
394
+ # attribute `vision_tower_aux_attention_masks_list`.
395
+ else self.vision_tower_aux_attention_masks_list
396
+ ),
397
+ final_vision_feature_size=(
398
+ final_vision_feature_size
399
+ if inputs_embeds is None
400
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
401
+ # attribute `final_vision_feature_size`.
402
+ else self.final_vision_feature_size
403
+ ),
404
+ global_context_feature=(
405
+ # pyre-fixme[61]: `global_context_feature` is undefined, or
406
+ # not always defined.
407
+ global_context_feature
408
+ if inputs_embeds is None
409
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
410
+ # attribute `global_context_feature`.
411
+ else self.global_context_feature
412
+ ),
413
+ )
414
+ else:
415
+ # pyre-fixme[29]: `CambrianLlamaModel` is not a function.
416
+ outputs = self.model(
417
+ input_ids=input_ids,
418
+ attention_mask=attention_mask,
419
+ position_ids=position_ids,
420
+ past_key_values=past_key_values,
421
+ inputs_embeds=inputs_embeds,
422
+ use_cache=use_cache,
423
+ output_attentions=output_attentions,
424
+ output_hidden_states=output_hidden_states,
425
+ return_dict=return_dict,
426
+ # final_vision_feature_size=final_vision_feature_size,
427
+ )
428
+
429
+ hidden_states = outputs[0]
430
+ if self.config.pretraining_tp > 1:
431
+ lm_head_slices = self.lm_head.weight.split(
432
+ self.vocab_size // self.config.pretraining_tp, dim=0
433
+ )
434
+ logits = [
435
+ F.linear(hidden_states, lm_head_slices[i])
436
+ for i in range(self.config.pretraining_tp)
437
+ ]
438
+ logits = torch.cat(logits, dim=-1)
439
+ else:
440
+ logits = self.lm_head(hidden_states)
441
+ logits = logits.float()
442
+
443
+ loss = None
444
+ if labels is not None:
445
+ # Shift so that tokens < n predict n
446
+ shift_logits = logits[..., :-1, :].contiguous()
447
+ shift_labels = labels[..., 1:].contiguous()
448
+ # Flatten the tokens
449
+ loss_fct = CrossEntropyLoss()
450
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
451
+ shift_labels = shift_labels.view(-1)
452
+ # Enable model parallelism
453
+ shift_labels = shift_labels.to(shift_logits.device)
454
+ loss = loss_fct(shift_logits, shift_labels)
455
+
456
+ if not return_dict:
457
+ output = (logits,) + outputs[1:]
458
+ return (loss,) + output if loss is not None else output
459
+
460
+ return CausalLMOutputWithPast(
461
+ loss=loss,
462
+ logits=logits,
463
+ past_key_values=outputs.past_key_values,
464
+ hidden_states=outputs.hidden_states,
465
+ attentions=outputs.attentions,
466
+ )
467
+
468
+ @torch.no_grad()
469
+ def generate(
470
+ self,
471
+ inputs: Optional[torch.Tensor] = None,
472
+ images: Optional[torch.Tensor] = None,
473
+ image_sizes: Optional[torch.Tensor] = None,
474
+ **kwargs,
475
+ ) -> Union[GenerateOutput, torch.LongTensor]:
476
+ position_ids = kwargs.pop("position_ids", None)
477
+ attention_mask = kwargs.pop("attention_mask", None)
478
+ if "inputs_embeds" in kwargs:
479
+ raise NotImplementedError("`inputs_embeds` is not supported")
480
+
481
+ if images is not None:
482
+ (
483
+ inputs,
484
+ position_ids,
485
+ attention_mask,
486
+ _,
487
+ inputs_embeds,
488
+ _,
489
+ vision_tower_aux_feature_list,
490
+ vision_tower_aux_attention_masks_list,
491
+ final_vision_feature_size,
492
+ global_context_feature,
493
+ ) = self.prepare_inputs_labels_for_multimodal(
494
+ inputs,
495
+ position_ids,
496
+ attention_mask,
497
+ None,
498
+ None,
499
+ images,
500
+ image_sizes=image_sizes,
501
+ )
502
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
503
+ # `vision_tower_aux_feature_list`.
504
+ self.vision_tower_aux_feature_list = vision_tower_aux_feature_list
505
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
506
+ # `vision_tower_aux_attention_masks_list`.
507
+ self.vision_tower_aux_attention_masks_list = (
508
+ vision_tower_aux_attention_masks_list
509
+ )
510
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
511
+ # `final_vision_feature_size`.
512
+ self.final_vision_feature_size = final_vision_feature_size
513
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
514
+ # `global_context_feature`.
515
+ self.global_context_feature = global_context_feature
516
+ else:
517
+ inputs_embeds = self.get_model().embed_tokens(inputs)
518
+
519
+ # pyre-fixme[16]: `LlamaForCausalLM` has no attribute `generate`.
520
+ return super().generate(
521
+ position_ids=position_ids,
522
+ attention_mask=attention_mask,
523
+ inputs_embeds=inputs_embeds,
524
+ **kwargs,
525
+ )
526
+
527
+ def prepare_inputs_for_generation(
528
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
529
+ ):
530
+ images = kwargs.pop("images", None)
531
+ image_sizes = kwargs.pop("image_sizes", None)
532
+ inputs = super().prepare_inputs_for_generation(
533
+ input_ids,
534
+ past_key_values=past_key_values,
535
+ inputs_embeds=inputs_embeds,
536
+ **kwargs,
537
+ )
538
+ if images is not None:
539
+ inputs["images"] = images
540
+ if image_sizes is not None:
541
+ inputs["image_sizes"] = image_sizes
542
+ return inputs
543
+
544
+
545
+ AutoConfig.register("cambrian_llama", CambrianConfig)
546
+ AutoModelForCausalLM.register(CambrianConfig, CambrianLlamaForCausalLM)
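With the `AutoConfig` / `AutoModelForCausalLM` registrations above (and the `auto_map` entries in `config.json`), the checkpoint can in principle be loaded through the Auto classes. A rough usage sketch, assuming this repo id and a text-only prompt; real multimodal use would additionally pass `images` and `image_sizes` built with the aux vision towers' image processors:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "jadechoghari/LongVU_Llama3_2_3B_img"  # assumption: the repo this commit belongs to
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(
    repo, trust_remote_code=True, torch_dtype=torch.float16
)

input_ids = tokenizer("Describe the scene.", return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))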
multimodal_encoder_builder.py ADDED
@@ -0,0 +1,368 @@
1
+ # pyre-unsafe
2
+ import copy
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from transformers import AutoImageProcessor, Dinov2Config, Dinov2Model, SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
6
+ from abc import ABC, abstractmethod
7
+ import torch.nn as nn
8
+
9
+
10
+ class ProcessorWrapper:
11
+ def __init__(
12
+ self,
13
+ transform,
14
+ height=378,
15
+ width=378,
16
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
17
+ ):
18
+ self._crop_size = {
19
+ "height": height,
20
+ "width": width,
21
+ }
22
+ self._transforms = transform
23
+ # print(transform)
24
+ self.image_mean = image_mean
25
+
26
+ @property
27
+ def crop_size(self):
28
+ return self._crop_size
29
+
30
+ def preprocess(self, image, return_tensors="pt"):
31
+ # Ensure image is a PIL Image
32
+ output = {}
33
+ output["pixel_values"] = [self._transforms(image)]
34
+ return output
35
+
36
+
37
+ class BaseVisionTower(nn.Module):
38
+ def __init__(self, vision_tower_name, args, delay_load=False):
39
+ super().__init__()
40
+
41
+ self.is_loaded = False
42
+ self.args = args
43
+
44
+ self.vision_tower_name = vision_tower_name
45
+ self.select_layer = args.mm_vision_select_layer
46
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
47
+ self.unfreeze_mm_vision_tower = getattr(args, "unfreeze_mm_vision_tower", False)
48
+ self.delay_load = delay_load
49
+
50
+ @abstractmethod
51
+ def load_model(self, device_map=None):
52
+ raise NotImplementedError("Subclasses must implement load_model")
53
+
54
+ @abstractmethod
55
+ def _forward(self, images):
56
+ raise NotImplementedError("Subclasses must implement forward")
57
+
58
+ def forward(self, images):
59
+ if type(images) is list:
60
+ image_features = [self._forward(image.unsqueeze(0)) for image in images]
61
+ else:
62
+ image_features = self._forward(images)
63
+
64
+ return image_features
65
+
66
+ @property
67
+ def dummy_feature(self):
68
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
69
+
70
+ @property
71
+ def dtype(self):
72
+ # Dynamically infer the dtype from the first parameter, if not explicitly specified
73
+ if hasattr(self.vision_tower, "dtype"):
74
+ return self.vision_tower.dtype
75
+ else:
76
+ params = list(self.vision_tower.parameters())
77
+ return (
78
+ params[0].dtype if len(params) > 0 else torch.float32
79
+ ) # Default to torch.float32 if no parameters
80
+
81
+ @property
82
+ def device(self):
83
+ # Dynamically infer the device from the first parameter, if not explicitly specified
84
+ if hasattr(self.vision_tower, "device"):
85
+ return self.vision_tower.device
86
+ else:
87
+ params = list(self.vision_tower.parameters())
88
+ return (
89
+ params[0].device if len(params) > 0 else torch.device("cpu")
90
+ ) # Default to CPU if no parameters
91
+
92
+ @property
93
+ def config(self):
94
+ if self.is_loaded:
95
+ return self.vision_tower.config
96
+ else:
97
+ return self.cfg_only
98
+
99
+ @property
100
+ def hidden_size(self):
101
+ try:
102
+ return self.config.hidden_size
103
+ except:
104
+ return self._hidden_size
105
+
106
+ @property
107
+ def image_size(self): # resolution
108
+ # return self.config.image_size
109
+ try:
110
+ return self.config.image_size
111
+ except:
112
+ return self._image_size
113
+
114
+ @property
115
+ def patch_size(self):
116
+ # return self.config.patch_size
117
+ try:
118
+ return self.config.patch_size
119
+ except:
120
+ return self._patch_size
121
+
122
+ @property
123
+ def num_patches_per_side(self):
124
+ if self._interp_size is not None:
125
+ return int(self._interp_size**0.5)
126
+ try:
127
+ return self.image_size // self.patch_size
128
+ except:
129
+ return self._num_patches_per_side
130
+
131
+ @property
132
+ def num_patches(self):
133
+ if self._interp_size is not None:
134
+ return self._interp_size
135
+ try:
136
+ return self.num_patches_per_side**2
137
+ except:
138
+ return self._num_patches
139
+
140
+
141
+ class DinoVisionTower(BaseVisionTower):
142
+ def __init__(self, vision_tower, args, delay_load=False):
143
+ super(DinoVisionTower, self).__init__(vision_tower, args, delay_load)
144
+
145
+ model_path = "facebook/dinov2-giant"
146
+ base_model_name, res, interp = model_path, 378, 576
147
+ self._vision_tower_name = vision_tower
148
+ self.vision_tower_name = base_model_name
149
+ self._image_size = res
150
+ self._interp_size = interp
151
+ self._patch_size = 14 # default patch size
152
+
153
+ if not self.delay_load:
154
+ self.load_model()
155
+ else:
156
+ self.cfg_only = Dinov2Config.from_pretrained(self.vision_tower_name)
157
+
158
+ def load_model(self, device_map=None):
159
+
160
+ self.vision_tower = Dinov2Model.from_pretrained(self.vision_tower_name)
161
+ """ValueError: Dinov2Model does not support `device_map='auto'`. To implement support, the model class needs to implement the `_no_split_modules` attribute."""
162
+ self.vision_tower._no_split_modules = ["Dinov2SwiGLUFFN"]
163
+
164
+ _image_size = self.vision_tower.config.image_size
165
+ if self._image_size is None:
166
+ self._image_size = _image_size
167
+
168
+ # increase shortest edge to prevent edge case crops
169
+ default_shortest_ratio = 8 / 7  # e.g. shortest edge 256 for a 224 crop
170
+ # shortest_edge = int(default_shortest_ratio * self._image_size)
171
+ shortest_edge = self._image_size
172
+
173
+ processor = AutoImageProcessor.from_pretrained(
174
+ self.vision_tower_name,
175
+ crop_size=dict(height=self._image_size, width=self._image_size),
176
+ size=dict(shortest_edge=shortest_edge),
177
+ )
178
+ self.image_processor = processor
179
+
180
+ # Assign the output channels of the projection convolution as the hidden size
181
+ self._hidden_size = (
182
+ self.vision_tower.embeddings.patch_embeddings.projection.out_channels
183
+ )
184
+ # Assign the first value of the stride of the projection convolution as the patch size
185
+ self._patch_size = (
186
+ self.vision_tower.embeddings.patch_embeddings.projection.stride[0]
187
+ )
188
+
189
+ # print(self._hidden_size, self._patch_size)
190
+
191
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
192
+ self.is_loaded = True
193
+
194
+ @property
195
+ def image_size(self):
196
+ return self._image_size
197
+
198
+ def feature_select(self, outputs):
199
+ sequence_output = outputs[
200
+ "last_hidden_state"
201
+ ] # batch_size, sequence_length, hidden_size
202
+
203
+ if self.select_feature == "cls_patch":
204
+ image_features = sequence_output
205
+ elif self.select_feature == "patch":
206
+ image_features = sequence_output[:, 1:]
207
+ elif self.select_feature == "cls":
208
+ image_features = sequence_output[:, 0]
209
+ else:
210
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
211
+ return image_features
212
+
213
+ def interpolate(self, image_features):
214
+ if self._interp_size is None:
215
+ return image_features
216
+
217
+ b, num_tokens, dim = image_features.shape
218
+
219
+ if num_tokens != self.num_patches:
220
+ target_h = target_w = int(self._interp_size**0.5)
221
+ h = w = int(num_tokens**0.5)
222
+
223
+ image_features = image_features.view(b, h, w, dim)
224
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
225
+
226
+ image_features = F.interpolate(
227
+ image_features.to(torch.float32),
228
+ size=(target_h, target_w),
229
+ mode="bilinear",
230
+ align_corners=False,
231
+ ).to(image_features.dtype)
232
+
233
+ # Permute the dimensions back to (b, target_h, target_w, dim)
234
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
235
+
236
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
237
+ image_features = image_features.flatten(1, 2)
238
+
239
+ return image_features
240
+
241
+ def _forward(self, images):
242
+ # logger.warning(f"images shape: {images.shape}")
243
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
244
+ image_forward_outs = self.vision_tower.forward(
245
+ images.to(device=self.device, dtype=self.dtype)
246
+ )
247
+ # logger.warning(f"image_forward_outs shape: {image_forward_outs['last_hidden_state'].shape}")
248
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
249
+ # logger.warning(f"image_features shape: {image_features.shape}")
250
+ interp_features = self.interpolate(image_features)
251
+ # logger.warning(f"interp_features shape: {interp_features.shape}")
252
+ return interp_features
253
+
254
+ @property
255
+ def num_patches_per_side(self):
256
+ return int(self.num_patches**0.5)
257
+
258
+ @property
259
+ def num_patches(self):
260
+ if self._interp_size is None:
261
+ return (self._image_size // self._patch_size) ** 2
262
+ else:
263
+ return self._interp_size
264
+
265
+
266
+ # from .siglip_encoder import SiglipVisionTower
267
+ class SiglipVisionTower(BaseVisionTower):
268
+ def __init__(self, vision_tower_name, args, delay_load=False):
269
+ super(SiglipVisionTower, self).__init__(vision_tower_name, args, delay_load)
270
+
271
+ model_path = "google/siglip-so400m-patch14-384"
272
+ base_model_name, res, interp = model_path, 384, 576
273
+ self.vision_tower_name = base_model_name
274
+ self._image_size = res if res is not None else 512
275
+ self._interp_size = interp
276
+ if not self.delay_load:
277
+ self.load_model()
278
+ elif self.unfreeze_mm_vision_tower:
279
+ self.load_model()
280
+ else:
281
+ self._hidden_size = 1152
282
+
283
+ def load_model(self, device_map=None):
284
+ self.vision_model = "siglip"
285
+ # clip_model, processor = create_model_from_pretrained(self.vision_tower_name)
286
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
287
+
288
+ # self.vision_tower = clip_model.visual.trunk
289
+ self.vision_tower.output_tokens = True
290
+
291
+ self._hidden_size = self.vision_tower.config.hidden_size
292
+ self._image_size = self.vision_tower.config.image_size
293
+ self._patch_size = self.vision_tower.config.patch_size
294
+ self.image_processor = SiglipImageProcessor.from_pretrained(
295
+ self.vision_tower_name
296
+ )
297
+
298
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
299
+ self.is_loaded = True
300
+
301
+ def interpolate(self, image_features):
302
+ if self._interp_size is None:
303
+ return image_features
304
+
305
+ b, num_tokens, dim = image_features.shape
306
+
307
+ if num_tokens != self.num_patches:
308
+ target_h = target_w = int(self._interp_size**0.5)
309
+ h = w = int(num_tokens**0.5)
310
+
311
+ image_features = image_features.view(b, h, w, dim)
312
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
313
+
314
+ image_features = F.interpolate(
315
+ image_features.to(torch.float32),
316
+ size=(target_h, target_w),
317
+ mode="bilinear",
318
+ align_corners=False,
319
+ ).to(image_features.dtype)
320
+
321
+ # Permute the dimensions back to (b, target_h, target_w, dim)
322
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
323
+
324
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
325
+ image_features = image_features.flatten(1, 2)
326
+
327
+ return image_features
328
+
329
+ def _forward(self, images, interpolate_token=576):
330
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
331
+ image_features = self.vision_tower.forward(
332
+ images.to(device=self.device, dtype=self.dtype),
333
+ output_hidden_states=True,
334
+ ).hidden_states[-1]
335
+ interp_features = self.interpolate(image_features)
336
+ return interp_features
337
+
338
+
339
+ def build_vision_tower_aux_list(vision_tower_cfg, **kwargs):
340
+ vision_tower_aux_name_list = getattr(
341
+ vision_tower_cfg,
342
+ "mm_vision_tower_aux_list",
343
+ getattr(vision_tower_cfg, "vision_tower_aux_list", None),
344
+ )
345
+ vision_tower_aux_token_len_list = getattr(
346
+ vision_tower_cfg,
347
+ "mm_vision_tower_aux_token_len_list",
348
+ getattr(vision_tower_cfg, "vision_tower_aux_token_len_list", None),
349
+ )
350
+ vision_tower_aux_list = []
351
+ for vision_tower_aux_name, vision_tower_aux_token_len in zip(
352
+ vision_tower_aux_name_list, vision_tower_aux_token_len_list
353
+ ):
354
+ config = copy.deepcopy(vision_tower_cfg)
355
+ vision_tower_aux_name += "-interp{}".format(vision_tower_aux_token_len)
356
+ if "siglip" in vision_tower_aux_name.lower():
357
+ vision_tower_aux_list.append(
358
+ SiglipVisionTower(vision_tower_aux_name, args=config, **kwargs)
359
+ )
360
+
361
+ # SSL-based Vision Towers
362
+ elif "dinov2" in vision_tower_aux_name.lower():
363
+ vision_tower_aux_list.append(
364
+ DinoVisionTower(vision_tower_aux_name, args=config, **kwargs)
365
+ )
366
+ else:
367
+ raise ValueError(f"Unknown vision tower: {vision_tower_aux_name}")
368
+ return vision_tower_aux_list
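A small sketch of how `build_vision_tower_aux_list` is driven by the config fields it reads; the `SimpleNamespace` below stands in for the model config and only fills the attributes the builders actually access (values copied from `config.json`). `delay_load=True` skips loading the full checkpoints, though the DINOv2 branch still fetches its config:

from types import SimpleNamespace

cfg = SimpleNamespace(
    mm_vision_tower_aux_list=[
        "siglip/CLIP-ViT-SO400M-14-384",
        "facebook/dinov2-giant-res378",
    ],
    mm_vision_tower_aux_token_len_list=[576, 576],
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
    unfreeze_mm_vision_tower=False,
)

towers = build_vision_tower_aux_list(cfg, delay_load=True)
print([type(t).__name__ for t in towers])  # ['SiglipVisionTower', 'DinoVisionTower']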
multimodal_projector_builder.py ADDED
@@ -0,0 +1,52 @@
1
+ # pyre-unsafe
2
+ import re
3
+
4
+ import torch.nn as nn
5
+
6
+
7
+ class IdentityMap(nn.Module):
8
+ def __init__(self):
9
+ super().__init__()
10
+
11
+ def forward(self, x, *args, **kwargs):
12
+ return x
13
+
14
+ @property
15
+ def config(self):
16
+ return {"mm_projector_type": "identity"}
17
+
18
+
19
+ class SimpleResBlock(nn.Module):
20
+ def __init__(self, channels):
21
+ super().__init__()
22
+ self.pre_norm = nn.LayerNorm(channels)
23
+
24
+ self.proj = nn.Sequential(
25
+ nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
26
+ )
27
+
28
+ def forward(self, x):
29
+ x = self.pre_norm(x)
30
+ return x + self.proj(x)
31
+
32
+
33
+ def build_vision_projector(config, delay_load=False, **kwargs):
34
+ projector_type = getattr(config, "mm_projector_type", "linear")
35
+ config.mm_hidden_size = 256
36
+
37
+ if projector_type == "linear":
38
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
39
+
40
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
41
+ if mlp_gelu_match:
42
+ mlp_depth = int(mlp_gelu_match.group(1))
43
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
44
+ for _ in range(1, mlp_depth):
45
+ modules.append(nn.GELU())
46
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
47
+ return nn.Sequential(*modules)
48
+
49
+ if projector_type == "identity":
50
+ return IdentityMap()
51
+
52
+ raise ValueError(f"Unknown projector type: {projector_type}")
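`build_vision_projector` hard-codes `mm_hidden_size` to 256 and only handles the `linear`, `mlpNx_gelu`, and `identity` variants, so the `"sva"` projector named in `config.json` is constructed elsewhere. A quick sketch of the MLP variant with a hypothetical config namespace:

import torch
from types import SimpleNamespace

cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", hidden_size=3072)
projector = build_vision_projector(cfg)  # also sets cfg.mm_hidden_size = 256
print(projector(torch.randn(1, 576, 256)).shape)  # torch.Size([1, 576, 3072])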
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b13794197529249a963b1b4332d69eeb251593680123f392094abe75261e234
3
+ size 7317846330
special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|eot_id|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ }
16
+ }
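These entries line up with the Llama 3 special tokens declared in `tokenizer_config.json` and with the `bos_token_id` / `eos_token_id` values in `config.json` (128000 for `<|begin_of_text|>`, 128009 for `<|eot_id|>`). A quick check, assuming the tokenizer from this repo:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("jadechoghari/LongVU_Llama3_2_3B_img")
print(tok.bos_token, tok.eos_token)             # <|begin_of_text|> <|eot_id|>
print(tok.convert_tokens_to_ids("<|eot_id|>"))  # 128009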
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2062 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "128000": {
4
+ "content": "<|begin_of_text|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "128001": {
12
+ "content": "<|end_of_text|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "128002": {
20
+ "content": "<|reserved_special_token_0|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "128003": {
28
+ "content": "<|reserved_special_token_1|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128004": {
36
+ "content": "<|finetune_right_pad_id|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "128005": {
44
+ "content": "<|reserved_special_token_2|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "128006": {
52
+ "content": "<|start_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128007": {
60
+ "content": "<|end_header_id|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "128008": {
68
+ "content": "<|eom_id|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "128009": {
76
+ "content": "<|eot_id|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "128010": {
84
+ "content": "<|python_tag|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "128011": {
92
+ "content": "<|reserved_special_token_3|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "128012": {
100
+ "content": "<|reserved_special_token_4|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "128013": {
108
+ "content": "<|reserved_special_token_5|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "128014": {
116
+ "content": "<|reserved_special_token_6|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "128015": {
124
+ "content": "<|reserved_special_token_7|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "128016": {
132
+ "content": "<|reserved_special_token_8|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "128017": {
140
+ "content": "<|reserved_special_token_9|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "128018": {
148
+ "content": "<|reserved_special_token_10|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "128019": {
156
+ "content": "<|reserved_special_token_11|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "128020": {
164
+ "content": "<|reserved_special_token_12|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "128021": {
172
+ "content": "<|reserved_special_token_13|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "128022": {
180
+ "content": "<|reserved_special_token_14|>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "128023": {
188
+ "content": "<|reserved_special_token_15|>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "128024": {
196
+ "content": "<|reserved_special_token_16|>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "128025": {
204
+ "content": "<|reserved_special_token_17|>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "128026": {
212
+ "content": "<|reserved_special_token_18|>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "128027": {
220
+ "content": "<|reserved_special_token_19|>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "128028": {
228
+ "content": "<|reserved_special_token_20|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "128029": {
236
+ "content": "<|reserved_special_token_21|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "128030": {
244
+ "content": "<|reserved_special_token_22|>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "128031": {
252
+ "content": "<|reserved_special_token_23|>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "128032": {
260
+ "content": "<|reserved_special_token_24|>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "128033": {
268
+ "content": "<|reserved_special_token_25|>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "128034": {
276
+ "content": "<|reserved_special_token_26|>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "128035": {
284
+ "content": "<|reserved_special_token_27|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "128036": {
292
+ "content": "<|reserved_special_token_28|>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "128037": {
300
+ "content": "<|reserved_special_token_29|>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "128038": {
308
+ "content": "<|reserved_special_token_30|>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "128039": {
316
+ "content": "<|reserved_special_token_31|>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "128040": {
324
+ "content": "<|reserved_special_token_32|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "128041": {
332
+ "content": "<|reserved_special_token_33|>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "128042": {
340
+ "content": "<|reserved_special_token_34|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "128043": {
348
+ "content": "<|reserved_special_token_35|>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "128044": {
356
+ "content": "<|reserved_special_token_36|>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "128045": {
364
+ "content": "<|reserved_special_token_37|>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "128046": {
372
+ "content": "<|reserved_special_token_38|>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "128047": {
380
+ "content": "<|reserved_special_token_39|>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "128048": {
388
+ "content": "<|reserved_special_token_40|>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "128049": {
396
+ "content": "<|reserved_special_token_41|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "128050": {
404
+ "content": "<|reserved_special_token_42|>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "128051": {
412
+ "content": "<|reserved_special_token_43|>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "128052": {
420
+ "content": "<|reserved_special_token_44|>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "128053": {
428
+ "content": "<|reserved_special_token_45|>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "128054": {
436
+ "content": "<|reserved_special_token_46|>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "128055": {
444
+ "content": "<|reserved_special_token_47|>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "128056": {
452
+ "content": "<|reserved_special_token_48|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "128057": {
460
+ "content": "<|reserved_special_token_49|>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "128058": {
468
+ "content": "<|reserved_special_token_50|>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "128059": {
476
+ "content": "<|reserved_special_token_51|>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "128060": {
484
+ "content": "<|reserved_special_token_52|>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "128061": {
492
+ "content": "<|reserved_special_token_53|>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "128062": {
500
+ "content": "<|reserved_special_token_54|>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "128063": {
508
+ "content": "<|reserved_special_token_55|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "128064": {
516
+ "content": "<|reserved_special_token_56|>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "128065": {
524
+ "content": "<|reserved_special_token_57|>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "128066": {
532
+ "content": "<|reserved_special_token_58|>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "128067": {
540
+ "content": "<|reserved_special_token_59|>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "128068": {
548
+ "content": "<|reserved_special_token_60|>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "128069": {
556
+ "content": "<|reserved_special_token_61|>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "128070": {
564
+ "content": "<|reserved_special_token_62|>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "128071": {
572
+ "content": "<|reserved_special_token_63|>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "128072": {
580
+ "content": "<|reserved_special_token_64|>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "128073": {
588
+ "content": "<|reserved_special_token_65|>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "128074": {
596
+ "content": "<|reserved_special_token_66|>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "128075": {
604
+ "content": "<|reserved_special_token_67|>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "128076": {
612
+ "content": "<|reserved_special_token_68|>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "128077": {
620
+ "content": "<|reserved_special_token_69|>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "128078": {
628
+ "content": "<|reserved_special_token_70|>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "128079": {
636
+ "content": "<|reserved_special_token_71|>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "128080": {
644
+ "content": "<|reserved_special_token_72|>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "128081": {
652
+ "content": "<|reserved_special_token_73|>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "128082": {
660
+ "content": "<|reserved_special_token_74|>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "128083": {
668
+ "content": "<|reserved_special_token_75|>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "128084": {
676
+ "content": "<|reserved_special_token_76|>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "128085": {
684
+ "content": "<|reserved_special_token_77|>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "128086": {
692
+ "content": "<|reserved_special_token_78|>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "128087": {
700
+ "content": "<|reserved_special_token_79|>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "128088": {
708
+ "content": "<|reserved_special_token_80|>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "128089": {
716
+ "content": "<|reserved_special_token_81|>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "128090": {
724
+ "content": "<|reserved_special_token_82|>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "128091": {
732
+ "content": "<|reserved_special_token_83|>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "128092": {
740
+ "content": "<|reserved_special_token_84|>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "128093": {
748
+ "content": "<|reserved_special_token_85|>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "128094": {
756
+ "content": "<|reserved_special_token_86|>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "128095": {
764
+ "content": "<|reserved_special_token_87|>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "128096": {
772
+ "content": "<|reserved_special_token_88|>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "128097": {
780
+ "content": "<|reserved_special_token_89|>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "128098": {
788
+ "content": "<|reserved_special_token_90|>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "128099": {
796
+ "content": "<|reserved_special_token_91|>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "128100": {
804
+ "content": "<|reserved_special_token_92|>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "128101": {
812
+ "content": "<|reserved_special_token_93|>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "128102": {
820
+ "content": "<|reserved_special_token_94|>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "128103": {
828
+ "content": "<|reserved_special_token_95|>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "128104": {
836
+ "content": "<|reserved_special_token_96|>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "128105": {
844
+ "content": "<|reserved_special_token_97|>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "128106": {
852
+ "content": "<|reserved_special_token_98|>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "128107": {
860
+ "content": "<|reserved_special_token_99|>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "128108": {
868
+ "content": "<|reserved_special_token_100|>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "128109": {
876
+ "content": "<|reserved_special_token_101|>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "128110": {
884
+ "content": "<|reserved_special_token_102|>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "128111": {
892
+ "content": "<|reserved_special_token_103|>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "128112": {
900
+ "content": "<|reserved_special_token_104|>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "128113": {
908
+ "content": "<|reserved_special_token_105|>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "128114": {
916
+ "content": "<|reserved_special_token_106|>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "128115": {
924
+ "content": "<|reserved_special_token_107|>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "128116": {
932
+ "content": "<|reserved_special_token_108|>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "128117": {
940
+ "content": "<|reserved_special_token_109|>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "128118": {
948
+ "content": "<|reserved_special_token_110|>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "128119": {
956
+ "content": "<|reserved_special_token_111|>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "128120": {
964
+ "content": "<|reserved_special_token_112|>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "128121": {
972
+ "content": "<|reserved_special_token_113|>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "128122": {
980
+ "content": "<|reserved_special_token_114|>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "128123": {
988
+ "content": "<|reserved_special_token_115|>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "128124": {
996
+ "content": "<|reserved_special_token_116|>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "128125": {
1004
+ "content": "<|reserved_special_token_117|>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "128126": {
1012
+ "content": "<|reserved_special_token_118|>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "128127": {
1020
+ "content": "<|reserved_special_token_119|>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "128128": {
1028
+ "content": "<|reserved_special_token_120|>",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "128129": {
1036
+ "content": "<|reserved_special_token_121|>",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "128130": {
1044
+ "content": "<|reserved_special_token_122|>",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "128131": {
1052
+ "content": "<|reserved_special_token_123|>",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "128132": {
1060
+ "content": "<|reserved_special_token_124|>",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "128133": {
1068
+ "content": "<|reserved_special_token_125|>",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "128134": {
1076
+ "content": "<|reserved_special_token_126|>",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "128135": {
1084
+ "content": "<|reserved_special_token_127|>",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "128136": {
1092
+ "content": "<|reserved_special_token_128|>",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "128137": {
1100
+ "content": "<|reserved_special_token_129|>",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "128138": {
1108
+ "content": "<|reserved_special_token_130|>",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "128139": {
1116
+ "content": "<|reserved_special_token_131|>",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "128140": {
1124
+ "content": "<|reserved_special_token_132|>",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "128141": {
1132
+ "content": "<|reserved_special_token_133|>",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "128142": {
1140
+ "content": "<|reserved_special_token_134|>",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "128143": {
1148
+ "content": "<|reserved_special_token_135|>",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "128144": {
1156
+ "content": "<|reserved_special_token_136|>",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "128145": {
1164
+ "content": "<|reserved_special_token_137|>",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "128146": {
1172
+ "content": "<|reserved_special_token_138|>",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "128147": {
1180
+ "content": "<|reserved_special_token_139|>",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "128148": {
1188
+ "content": "<|reserved_special_token_140|>",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_141|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_142|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_143|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_144|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_145|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_146|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_147|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_148|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_149|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_150|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_151|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_152|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_153|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_154|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_155|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_156|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_157|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_158|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_159|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_160|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_161|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_162|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_163|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_164|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_165|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_166|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_167|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_168|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_169|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_170|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_171|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_172|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_173|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_174|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_175|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_176|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_177|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_178|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_179|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_180|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_181|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_182|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_183|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_184|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_185|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_186|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_187|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_188|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_189|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_190|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_191|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_192|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_193|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_194|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_195|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_196|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_197|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_198|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_199|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_200|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_201|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_202|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_203|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_204|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_205|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_206|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_207|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_208|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_209|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_210|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_211|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_212|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_213|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_214|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_215|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_216|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_217|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_218|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_219|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_220|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_221|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_222|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_223|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_224|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_225|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_226|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_227|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_228|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_229|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_230|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_231|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_232|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_233|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_234|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_235|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_236|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_237|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_238|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_239|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_240|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_241|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_242|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_243|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_244|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_245|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_246|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_247|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ }
2051
+ },
2052
+ "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
2054
+ "clean_up_tokenization_spaces": true,
2055
+ "eos_token": "<|eot_id|>",
2056
+ "model_input_names": [
2057
+ "input_ids",
2058
+ "attention_mask"
2059
+ ],
2060
+ "model_max_length": 131072,
2061
+ "tokenizer_class": "PreTrainedTokenizerFast"
2062
+ }
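A quick, hedged illustration (editor's addition, not part of this commit) of how the tokenizer configuration above is typically exercised once the files are on the Hub; the repository id below is a placeholder:

# Illustrative sketch only -- "your-org/cambrian-8b" is a placeholder repo id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/cambrian-8b")
messages = [{"role": "user", "content": "Describe the image."}]
# The chat_template defined above always appends the assistant header itself,
# so the returned string is already a generation-ready Llama-3-style prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)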
vision_sampler.py ADDED
@@ -0,0 +1,566 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+
8
+
9
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
10
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
11
+ """
12
+ grid_size: int of the grid height and width
13
+ return:
14
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
15
+ """
16
+ grid_h = np.arange(grid_size, dtype=np.float32)
17
+ grid_w = np.arange(grid_size, dtype=np.float32)
18
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
19
+ grid = np.stack(grid, axis=0)
20
+
21
+ grid = grid.reshape([2, 1, grid_size, grid_size])
22
+
23
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
24
+ if cls_token:
25
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
26
+ return pos_embed
27
+
28
+
29
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
30
+ assert embed_dim % 2 == 0
31
+
32
+ # use half of dimensions to encode grid_h
33
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
34
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
35
+
36
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
37
+ return emb
38
+
39
+
40
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
41
+ """
42
+ embed_dim: output dimension for each position
43
+ pos: a list of positions to be encoded: size (M,)
44
+ out: (M, D)
45
+ """
46
+ assert embed_dim % 2 == 0
47
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
48
+ omega /= embed_dim / 2.0
49
+ omega = 1.0 / 10000**omega # (D/2,)
50
+
51
+ pos = pos.reshape(-1) # (M,)
52
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
53
+
54
+ emb_sin = np.sin(out) # (M, D/2)
55
+ emb_cos = np.cos(out) # (M, D/2)
56
+
57
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
58
+ return emb
59
+
60
+
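# --- Illustrative note (editor's addition, not part of the committed file) ---
# Shape check for the sin-cos helpers above, assuming embed_dim=1024 and a 24x24 grid:
#   pos = get_2d_sincos_pos_embed(1024, 24)                      # (576, 1024)
#   pos_cls = get_2d_sincos_pos_embed(1024, 24, cls_token=True)  # (577, 1024), row 0 is zeros
# Each row concatenates the height encoding (first 512 dims) with the width
# encoding (last 512 dims), each of which is half sin and half cos features.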
61
+ class CrossAttention(nn.Module):
62
+
63
+ def __init__(self, q_dim, kv_dim, hidden_dim, num_heads, attention_bias=False):
64
+ super().__init__()
65
+ self.hidden_dim = hidden_dim
66
+ self.num_heads = num_heads
67
+ self.head_dim = self.hidden_dim // self.num_heads
68
+
69
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
70
+ raise ValueError(
71
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
72
+ f" and `num_heads`: {self.num_heads})."
73
+ )
74
+
75
+ self.q_proj = nn.Sequential(
76
+ nn.LayerNorm(q_dim),
77
+ nn.Linear(q_dim, self.num_heads * self.head_dim, bias=attention_bias),
78
+ )
79
+ self.k_proj = nn.Sequential(
80
+ nn.LayerNorm(kv_dim),
81
+ nn.Linear(kv_dim, self.num_heads * self.head_dim, bias=attention_bias),
82
+ )
83
+ self.v_proj = nn.Sequential(
84
+ nn.LayerNorm(kv_dim),
85
+ nn.Linear(kv_dim, self.num_heads * self.head_dim, bias=attention_bias),
86
+ )
87
+ self.o_proj = nn.Linear(
88
+ self.num_heads * self.head_dim, q_dim, bias=attention_bias
89
+ )
90
+
91
+ def forward(self, vision_latents, queries, attention_mask):
92
+
93
+ bsz, q_len, _ = queries.size()
94
+ bsz, v_len, _ = vision_latents.size()
95
+
96
+ query_states = self.q_proj(queries)
97
+ key_states = self.k_proj(vision_latents)
98
+ value_states = self.v_proj(vision_latents)
99
+
100
+ query_states = query_states.view(
101
+ bsz, q_len, self.num_heads, self.head_dim
102
+ ).transpose(1, 2)
103
+ key_states = key_states.view(
104
+ bsz, v_len, self.num_heads, self.head_dim
105
+ ).transpose(1, 2)
106
+ value_states = value_states.view(
107
+ bsz, v_len, self.num_heads, self.head_dim
108
+ ).transpose(1, 2)
109
+
110
+ if attention_mask is not None:
111
+ if attention_mask.size() != (bsz, 1, q_len, v_len):
112
+ raise ValueError(
113
+ f"Attention mask should be of size {(bsz, 1, q_len, v_len)}, but is {attention_mask.size()}"
114
+ )
115
+
116
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
117
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
118
+ if query_states.device.type == "cuda" and attention_mask is not None:
119
+ query_states = query_states.contiguous()
120
+ key_states = key_states.contiguous()
121
+ value_states = value_states.contiguous()
122
+
123
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
124
+ query_states,
125
+ key_states,
126
+ value_states,
127
+ attn_mask=attention_mask,
128
+ )
129
+
130
+ attn_output = attn_output.transpose(1, 2).contiguous()
131
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_dim)
132
+
133
+ attn_output = self.o_proj(attn_output)
134
+
135
+ return attn_output
136
+
137
+
138
+ class AggregationBlock(nn.Module):
139
+ def __init__(
140
+ self, attention, q_dim, kv_dim, hidden_dim, num_heads, attention_bias=False
141
+ ):
142
+ super().__init__()
143
+ self.hidden_dim = hidden_dim
144
+ self.num_heads = num_heads
145
+ self.head_dim = self.hidden_dim // self.num_heads
146
+
147
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
148
+ raise ValueError(
149
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
150
+ f" and `num_heads`: {self.num_heads})."
151
+ )
152
+
153
+ self.attention = attention
154
+ if attention:
155
+ self.attention_layer = CrossAttention(
156
+ q_dim, kv_dim, hidden_dim, num_heads, attention_bias
157
+ )
158
+ else:
159
+ self.attention_layer = MLP(kv_dim, q_dim, q_dim)
160
+
161
+ def forward(self, vision_latents, queries, attention_mask):
162
+ if self.attention:
163
+ queries = self.attention_layer(vision_latents, queries, attention_mask)
164
+ else:
165
+ queries = self.attention_layer(vision_latents)
166
+
167
+ return queries
168
+
169
+
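# --- Illustrative note (editor's addition, not part of the committed file) ---
# AggregationBlock switches between two aggregation paths: with attention=True it
# cross-attends the queries to the vision latents; with attention=False it simply
# projects the latents through an MLP and ignores the queries. For example:
#   agg = AggregationBlock(True, q_dim=4096, kv_dim=1024, hidden_dim=1024, num_heads=16)
#   out = agg(latents, queries, None)        # (B, Q, 4096) via cross-attention
#   agg_mlp = AggregationBlock(False, q_dim=4096, kv_dim=1024, hidden_dim=1024, num_heads=16)
#   out = agg_mlp(latents, queries, None)    # (B, V, 4096) via the MLP path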
170
+ class MultiKVCrossAttention(nn.Module):
171
+
172
+ def __init__(self, q_dim, kv_dim_list, hidden_dim, num_heads, attention_bias=False):
173
+ super().__init__()
174
+
175
+ self.hidden_dim = hidden_dim
176
+ self.num_heads = num_heads
177
+ self.head_dim = self.hidden_dim // self.num_heads
178
+
179
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
180
+ raise ValueError(
181
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
182
+ f" and `num_heads`: {self.num_heads})."
183
+ )
184
+
185
+ self.q_proj = nn.Sequential(
186
+ nn.LayerNorm(q_dim),
187
+ nn.Linear(q_dim, self.num_heads * self.head_dim, bias=attention_bias),
188
+ )
189
+ self.num_of_kvs = len(kv_dim_list)
190
+ for i, kv_dim in enumerate(kv_dim_list):
191
+ setattr(
192
+ self,
193
+ "k_proj_{}".format(i),
194
+ nn.Sequential(
195
+ nn.LayerNorm(kv_dim),
196
+ nn.Linear(
197
+ kv_dim, self.num_heads * self.head_dim, bias=attention_bias
198
+ ),
199
+ ),
200
+ )
201
+ setattr(
202
+ self,
203
+ "v_proj_{}".format(i),
204
+ nn.Sequential(
205
+ nn.LayerNorm(kv_dim),
206
+ nn.Linear(
207
+ kv_dim, self.num_heads * self.head_dim, bias=attention_bias
208
+ ),
209
+ ),
210
+ )
211
+ self.o_proj = nn.Linear(
212
+ self.num_heads * self.head_dim, q_dim, bias=attention_bias
213
+ )
214
+
215
+ def forward(
216
+ self,
217
+ queries,
218
+ *vision_latents_attention_mask_list,
219
+ ):
220
+
221
+ vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
222
+ attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
223
+
224
+ bsz, q_len, _ = queries.size()
225
+
226
+ query_states = self.q_proj(queries)
227
+ key_states = torch.cat(
228
+ [
229
+ getattr(self, "k_proj_{}".format(i))(vision_latents_list[i])
230
+ for i in range(self.num_of_kvs)
231
+ ],
232
+ dim=1,
233
+ )
234
+ value_states = torch.cat(
235
+ [
236
+ getattr(self, "v_proj_{}".format(i))(vision_latents_list[i])
237
+ for i in range(self.num_of_kvs)
238
+ ],
239
+ dim=1,
240
+ )
241
+
242
+ v_len = key_states.shape[1]
243
+
244
+ query_states = query_states.view(
245
+ bsz, q_len, self.num_heads, self.head_dim
246
+ ).transpose(1, 2)
247
+ key_states = key_states.view(
248
+ bsz, v_len, self.num_heads, self.head_dim
249
+ ).transpose(1, 2)
250
+ value_states = value_states.view(
251
+ bsz, v_len, self.num_heads, self.head_dim
252
+ ).transpose(1, 2)
253
+
254
+ # if kv_weight is not None:
255
+ # kv_weight = kv_weight.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
256
+
257
+ attention_mask = torch.cat(attention_mask_list, dim=-1)
258
+
259
+ if attention_mask is not None:
260
+ if attention_mask.size() != (bsz, 1, q_len, v_len):
261
+ raise ValueError(
262
+ f"Attention mask should be of size {(bsz, 1, q_len, v_len)}, but is {attention_mask.size()}"
263
+ )
264
+
265
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
266
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
267
+ if query_states.device.type == "cuda" and attention_mask is not None:
268
+ query_states = query_states.contiguous()
269
+ key_states = key_states.contiguous()
270
+ value_states = value_states.contiguous()
271
+
272
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
273
+ query_states,
274
+ key_states,
275
+ value_states,
276
+ attn_mask=attention_mask,
277
+ )
278
+ # attn_output = spda(
279
+ # query_states,
280
+ # key_states,
281
+ # value_states,
282
+ # attn_mask=attention_mask,
283
+ # additional_score=kv_weight
284
+ # )
285
+
286
+ attn_output = attn_output.transpose(1, 2).contiguous()
287
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_dim)
288
+
289
+ attn_output = self.o_proj(attn_output)
290
+
291
+ return attn_output
292
+
293
+
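# --- Illustrative note (editor's addition, not part of the committed file) ---
# MultiKVCrossAttention takes the queries followed by num_of_kvs vision-latent
# tensors and then num_of_kvs attention masks; keys/values from all sources are
# concatenated along the sequence dimension before a single SDPA call. Sketch:
#   mkv = MultiKVCrossAttention(q_dim=1024, kv_dim_list=[1024, 1536], hidden_dim=1024, num_heads=16)
#   q  = torch.randn(2, 144, 1024)
#   v1, v2 = torch.randn(2, 576, 1024), torch.randn(2, 729, 1536)
#   m1 = torch.zeros(2, 1, 144, 576)   # additive masks; zeros = attend everywhere
#   m2 = torch.zeros(2, 1, 144, 729)
#   out = mkv(q, v1, v2, m1, m2)       # torch.Size([2, 144, 1024])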
294
+ class MLP(nn.Module):
295
+ def __init__(self, d_in, d_hidden, d_out):
296
+ super().__init__()
297
+ self.linear_1 = nn.Linear(d_in, d_hidden, bias=False)
298
+ self.act = nn.GELU()
299
+ self.linear_2 = nn.Linear(d_hidden, d_out, bias=False)
300
+
301
+ def forward(self, x):
302
+ return self.linear_2(self.act(self.linear_1(x)))
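MLP above is a bias-free two-layer GELU projector; a quick shape check with arbitrary, illustrative sizes:

import torch

mlp = MLP(d_in=1024, d_hidden=1024, d_out=4096)
print(mlp(torch.randn(2, 576, 1024)).shape)  # torch.Size([2, 576, 4096])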
+
+
+ class VisionCrossAttentionLayer(nn.Module):
+     def __init__(
+         self,
+         q_dim,
+         context_dim,
+         kv_dim_list,
+         kv_size_list,
+         hidden_dim=1024,
+         layer_idx=0,
+     ):
+         super().__init__()
+         num_heads = 16
+         self.num_of_kvs = len(kv_dim_list)
+
+         self.proj_context = nn.Linear(context_dim, hidden_dim, bias=False)
+         self.proj_in = nn.Linear(q_dim + hidden_dim, hidden_dim, bias=False)
+         # if self.num_of_kvs > 1:
+         #     self.weight_mlp = MLP(q_dim+hidden_dim, hidden_dim, self.num_of_kvs)
+         #     self.tower_weight = nn.Parameter(torch.zeros((self.num_of_kvs)))
+         self.proj_out = MLP(hidden_dim, hidden_dim, q_dim)
+
+         self.norm = nn.LayerNorm(hidden_dim)
+
+         self.cross_attn = MultiKVCrossAttention(
+             hidden_dim, kv_dim_list, hidden_dim, num_heads
+         )
+         self.kv_size_list = kv_size_list
+         for i, kv_size in enumerate(kv_size_list):
+             if kv_size > 1:
+                 setattr(
+                     self,
+                     "pos_embed_{}".format(i),
+                     nn.Parameter(torch.randn(kv_size**2, hidden_dim)),
+                 )
+                 # self.register_buffer("pos_embed_{}".format(i), torch.from_numpy(get_2d_sincos_pos_embed(hidden_dim, kv_size)).float(), persistent=False)
+
+     def forward(
+         self,
+         queries,
+         context_feature,
+         *vision_latents_attention_mask_list,
+     ) -> torch.FloatTensor:
+
+         residual = queries
+         # queries = self.proj_in(queries)
+         context_feature = self.proj_context(context_feature)
+         # queries = queries + context_feature
+         queries = torch.cat([queries, context_feature], -1)
+
+         # if self.num_of_kvs > 1:
+         #     kv_weight = self.weight_mlp(queries) # B * 1 * num_tower
+         #     kv_weight = kv_weight + self.tower_weight.view(1, 1, -1)
+         #     kv_weight = kv_weight.softmax(-1)
+         #     kv_number_list = [size**2 for size in self.kv_size_list]
+         #     kv_weight = torch.repeat_interleave(kv_weight, torch.tensor(kv_number_list).to(kv_weight.device), dim=-1)
+         # else:
+         #     kv_weight = None
+
+         queries = self.proj_in(queries)
+
+         vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+         attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+         attention_mask_list_reshaped = []
+         if attention_mask_list is not None:
+             for attention_mask in attention_mask_list:
+                 attention_mask = attention_mask.view(attention_mask.shape[0], 1, 1, -1)
+                 attention_mask = attention_mask.expand(-1, -1, queries.shape[1], -1)
+                 attention_mask_list_reshaped.append(attention_mask)
+
+         vision_latents_pos_list = []
+         for i, vision_latents in enumerate(vision_latents_list):
+             if vision_latents.shape[1] > 1:
+                 vision_latents_pos_list.append(
+                     vision_latents
+                     + getattr(self, "pos_embed_{}".format(i))[None, :, :].to(
+                         vision_latents.dtype
+                     )
+                 )
+             else:
+                 vision_latents_pos_list.append(vision_latents)
+
+         # Cross Attention
+         attention_output = self.cross_attn(
+             queries, *vision_latents_pos_list, *attention_mask_list_reshaped
+         )
+
+         # attention_output = (attention_output * combination_weight).sum(2)
+         queries = queries + attention_output
+
+         queries = self.norm(queries)
+
+         queries = self.proj_out(queries)
+
+         queries = queries + residual
+
+         return queries
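In short, the layer concatenates each query with its projected context feature, projects the pair down to hidden_dim, adds a learned positional embedding to every gridded source, cross-attends to all sources jointly, then returns to q_dim through LayerNorm, the output MLP, and a residual connection. A rough usage sketch under assumed sizes: one 27x27 grid source plus one single-token source, both already at the 1024-dim hidden width (the real widths come from the vision towers).

import torch

layer = VisionCrossAttentionLayer(q_dim=4096, context_dim=1024,
                                  kv_dim_list=[1024, 1024], kv_size_list=[27, 1])
queries = torch.randn(2, 576, 4096)
context = torch.randn(2, 576, 1024)            # must have the same token count as queries
grid = torch.randn(2, 27 * 27, 1024)           # gridded source, receives pos_embed_0
gist = torch.randn(2, 1, 1024)                 # single-token source, no positional embedding
m0 = torch.ones(2, 27 * 27, dtype=torch.bool)
m1 = torch.ones(2, 1, dtype=torch.bool)
out = layer(queries, context, grid, gist, m0, m1)   # -> (2, 576, 4096)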
+
+
+ class VisionAggregationLayer(nn.Module):
+     def __init__(
+         self,
+         q_dim,
+         context_dim,
+         kv_dim_list,
+         kv_size_list,
+         hidden_dim=1024,
+         layer_idx=0,
+     ):
+         super().__init__()
+         num_heads = 16
+         self.num_of_kvs = len(kv_dim_list)
+
+         self.proj_context = nn.Linear(context_dim, hidden_dim, bias=False)
+         self.proj_in = nn.Linear(q_dim + hidden_dim, hidden_dim, bias=False)
+
+         self.proj_out = MLP(hidden_dim, hidden_dim, q_dim)
+
+         self.norm = nn.LayerNorm(hidden_dim)
+
+         if self.num_of_kvs > 1:
+             self.weight_mlp = MLP(q_dim + hidden_dim, hidden_dim, self.num_of_kvs)
+
+         for i, kv_size in enumerate(kv_size_list):
+             if kv_size > 1:
+                 setattr(
+                     self,
+                     "pos_embed_{}".format(i),
+                     nn.Parameter(torch.randn(kv_size**2, hidden_dim)),
+                 )
+                 setattr(
+                     self,
+                     "aggregate_{}".format(i),
+                     AggregationBlock(
+                         True, hidden_dim, kv_dim_list[i], hidden_dim, num_heads
+                     ),
+                 )
+             else:
+                 setattr(
+                     self,
+                     "aggregate_{}".format(i),
+                     AggregationBlock(
+                         False, hidden_dim, kv_dim_list[i], hidden_dim, num_heads
+                     ),
+                 )
+
+     def forward(
+         self,
+         queries,
+         context_feature,
+         *vision_latents_attention_mask_list,
+     ) -> torch.FloatTensor:
+
+         residual = queries
+         # queries = self.proj_in(queries)
+         context_feature = self.proj_context(context_feature)
+         # queries = queries + context_feature
+         queries = torch.cat([queries, context_feature], -1)
+
+         if self.num_of_kvs > 1:
+             combination_weight = self.weight_mlp(queries).softmax(
+                 -1
+             )  # B * 1 * num_tower
+             combination_weight = combination_weight.unsqueeze(-1)
+         else:
+             combination_weight = 1
+
+         queries = self.proj_in(queries)
+
+         vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+         attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+         attention_mask_list_reshaped = []
+         if attention_mask_list is not None:
+             for attention_mask in attention_mask_list:
+                 attention_mask = attention_mask.view(attention_mask.shape[0], 1, 1, -1)
+                 attention_mask = attention_mask.expand(-1, -1, queries.shape[1], -1)
+                 attention_mask_list_reshaped.append(attention_mask)
+
+         vision_latents_pos_list = []
+         for i, vision_latents in enumerate(vision_latents_list):
+             if vision_latents.shape[1] > 1:
+                 vision_latents_pos_list.append(
+                     vision_latents
+                     + getattr(self, "pos_embed_{}".format(i))[None, :, :].to(
+                         vision_latents.dtype
+                     )
+                 )
+             else:
+                 vision_latents_pos_list.append(vision_latents)
+
+         aggregated_vision_latents_list = []
+         for i, (vision_latents, attention_mask) in enumerate(
+             zip(vision_latents_pos_list, attention_mask_list_reshaped)
+         ):
+             aggregated_vision_latents_list.append(
+                 getattr(self, "aggregate_{}".format(i))(
+                     vision_latents, queries, attention_mask
+                 )
+             )
+
+         aggregated_vision_latents = torch.stack(aggregated_vision_latents_list, 2)
+
+         queries = queries + (aggregated_vision_latents * combination_weight).sum(2)
+
+         queries = self.norm(queries)
+
+         queries = self.proj_out(queries)
+
+         queries = queries + residual
+
+         return queries
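Unlike the joint layer above, this variant pools each source with its own AggregationBlock and then mixes the pooled results with query-dependent softmax weights. A minimal illustration of that combination step, using three hypothetical sources and placeholder sizes:

import torch

w = torch.randn(2, 576, 3).softmax(-1).unsqueeze(-1)  # (bsz, q_len, num_sources, 1) mixing weights
agg = torch.randn(2, 576, 3, 1024)                    # stacked per-source aggregation outputs
mixed = (agg * w).sum(2)                              # (bsz, q_len, hidden_dim), as in the residual update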
+
+
+ class VisionTokenSampler(nn.Module):
+     def __init__(
+         self,
+         q_dim,
+         context_dim,
+         kv_dim_list,
+         kv_size_list,
+         vision_hidden_size,
+         num_of_layers=1,
+         layer_type="joint",
+     ):
+         super().__init__()
+         assert layer_type in ["joint", "sep"]
+         if layer_type == "joint":
+             self.layers = nn.ModuleList(
+                 [
+                     VisionCrossAttentionLayer(
+                         q_dim,
+                         context_dim,
+                         kv_dim_list,
+                         kv_size_list,
+                         vision_hidden_size,
+                         idx,
+                     )
+                     for idx in range(num_of_layers)
+                 ]
+             )
+         else:
+             self.layers = nn.ModuleList(
+                 [
+                     VisionAggregationLayer(
+                         q_dim,
+                         context_dim,
+                         kv_dim_list,
+                         kv_size_list,
+                         vision_hidden_size,
+                         idx,
+                     )
+                     for idx in range(num_of_layers)
+                 ]
+             )
+
+     def forward(self, queries, context_feature, *vision_latents_attention_mask_list):
+         for layer in self.layers:
+             queries = layer(
+                 queries, context_feature, *vision_latents_attention_mask_list
+             )
+         return queries
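Tying it together, VisionTokenSampler just stacks num_of_layers of the joint or per-source layers. A rough end-to-end sketch with placeholder sizes: two sources (a 27x27 grid and a single global token), latents assumed pre-projected to the 1024-dim vision hidden size, and masks passed flat after the latents exactly as the layers expect.

import torch

sampler = VisionTokenSampler(q_dim=4096, context_dim=1024,
                             kv_dim_list=[1024, 1024], kv_size_list=[27, 1],
                             vision_hidden_size=1024, num_of_layers=3, layer_type="joint")
queries = torch.randn(2, 576, 4096)
context = torch.randn(2, 576, 1024)
grid = torch.randn(2, 27 * 27, 1024)
gist = torch.randn(2, 1, 1024)
m0 = torch.ones(2, 27 * 27, dtype=torch.bool)
m1 = torch.ones(2, 1, dtype=torch.bool)
out = sampler(queries, context, grid, gist, m0, m1)
assert out.shape == (2, 576, 4096)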