jadechoghari committed
Commit cc3d147
1 Parent(s): 2cc1492

add initial files

README.md ADDED
@@ -0,0 +1,136 @@
---
datasets:
- shenxq/OneVision
- shenxq/VideoChat2
base_model:
- Vision-CAIR/LongVU_Qwen2_7B_img
pipeline_tag: video-text-to-text
model-index:
- name: llava-onevision-qwen-7b-ov
  results:
  - task:
      type: multimodal
    dataset:
      name: EgoSchema
      type: egoschema
    metrics:
    - type: accuracy
      value: 67.6
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: MLVU
      type: mlvu
    metrics:
    - type: accuracy
      value: 65.4
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: MVBench
      type: mvbench
    metrics:
    - type: accuracy
      value: 66.9
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: VideoMME
      type: videomme
    metrics:
    - type: accuracy
      value: 60.6
      name: accuracy
      verified: true
---
# LongVU

This repository contains the Qwen2-7B-based LongVU model presented in [LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding](https://huggingface.co/papers/2410.17434).

Try the model in the [HF demo](https://huggingface.co/spaces/Vision-CAIR/LongVU).

<div align="left">
  <a href='https://vision-cair.github.io/LongVU'><img src="https://longvu.s3.amazonaws.com/assets/demo.gif" alt="Demo GIF" style="width: 100%; max-width: 650px;"></a>
</div>

# Use

We provide a simple generation example below. For more details, please refer to the [GitHub repository](https://github.com/Vision-CAIR/LongVU).

```python
# git clone https://github.com/Vision-CAIR/LongVU
import numpy as np
import torch
from longvu.builder import load_pretrained_model
from longvu.constants import (
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from longvu.conversation import conv_templates, SeparatorStyle
from longvu.mm_datautils import (
    KeywordsStoppingCriteria,
    process_images,
    tokenizer_image_token,
)
from decord import cpu, VideoReader

tokenizer, model, image_processor, context_len = load_pretrained_model(
    "./checkpoints/longvu_qwen", None, "cambrian_qwen",
)

model.eval()
video_path = "./examples/video1.mp4"
qs = "Describe this video in detail"

# decode the video and sample roughly one frame per second
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
fps = float(vr.get_avg_fps())
frame_indices = np.array([i for i in range(0, len(vr), round(fps))])
video = []
for frame_index in frame_indices:
    img = vr[frame_index].asnumpy()
    video.append(img)
video = np.stack(video)
image_sizes = [video[0].shape[:2]]
video = process_images(video, image_processor, model.config)
video = [item.unsqueeze(0) for item in video]

# build the prompt with the image placeholder token
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
conv = conv_templates["qwen"].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=video,
        image_sizes=image_sizes,
        do_sample=False,
        temperature=0.2,
        max_new_tokens=128,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
    )
pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
```
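
The example above samples roughly one frame per second with `decord` before handing the frames to the model. As a minimal, standalone sketch of just that sampling step (the video path is a placeholder, and `decord` and `numpy` are assumed to be installed):

```python
import numpy as np
from decord import cpu, VideoReader

vr = VideoReader("./examples/video1.mp4", ctx=cpu(0), num_threads=1)  # placeholder path
fps = float(vr.get_avg_fps())
frame_indices = np.arange(0, len(vr), round(fps))  # one index per ~second of video
frames = np.stack([vr[int(i)].asnumpy() for i in frame_indices])  # (num_frames, H, W, 3) uint8
print(frames.shape)
```

Longer videos therefore yield proportionally more frames; LongVU's spatiotemporal adaptive compression is what keeps the resulting visual token count manageable.
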

# Citation

```
@article{shen2024longvu,
  title={LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding},
  author={Shen, Xiaoqian and Xiong, Yunyang and Zhao, Changsheng and Wu, Lemeng and Chen, Jun and Zhu, Chenchen and Liu, Zechun and Xiao, Fanyi and Varadarajan, Balakrishnan and Bordes, Florian and Liu, Zhuang and Xu, Hu and J. Kim, Hyunwoo and Soran, Bilge and Krishnamoorthi, Raghuraman and Elhoseiny, Mohamed and Chandra, Vikas},
  journal={arXiv:2410.17434},
  year={2024}
}
```
cambrian_arch.py ADDED
@@ -0,0 +1,1712 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import math
17
+ import random
18
+ from abc import ABC, abstractmethod
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ # define the constants
25
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
26
+ WORKER_HEART_BEAT_INTERVAL = 15
27
+
28
+ LOGDIR = "."
29
+
30
+ # Model Constants
31
+ IGNORE_INDEX = -100
32
+ IMAGE_TOKEN_INDEX = -200
33
+ DEFAULT_IMAGE_TOKEN = "<image>"
34
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
35
+ DEFAULT_IM_START_TOKEN = "<im_start>"
36
+ DEFAULT_IM_END_TOKEN = "<im_end>"
37
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
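# Annotation: the constants above appear to mirror longvu.constants (the README example
# imports DEFAULT_IMAGE_TOKEN and IMAGE_TOKEN_INDEX from that module); duplicating them
# here presumably keeps this file loadable on its own.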
38
+
39
+ from .multimodal_encoder_builder import build_vision_tower_aux_list
40
+ from .multimodal_projector_builder import build_vision_projector
41
+ from .vision_sampler import VisionTokenSampler
42
+
43
+ IS_XLA_AVAILABLE = False
44
+
45
+
46
+ class CambrianMetaModel:
47
+
48
+ def __init__(self, config):
49
+ super(CambrianMetaModel, self).__init__(config)
50
+
51
+ if hasattr(config, "mm_vision_tower_aux_list"):
52
+
53
+ projector_type = getattr(config, "mm_projector_type", "linear")
54
+ if projector_type == "sva":
55
+
56
+ vision_hidden_size = config.vision_hidden_size
57
+ num_query_group = config.num_query_group
58
+ query_num_list = config.query_num_list
59
+ connector_only = config.connector_only
60
+ connector_depth = config.connector_depth
61
+ self.vision_tower_aux_list = build_vision_tower_aux_list(
62
+ config, delay_load=True
63
+ )
64
+ self.mm_projector = nn.Sequential(
65
+ nn.Linear(vision_hidden_size * num_query_group, config.hidden_size),
66
+ nn.GELU(),
67
+ nn.Linear(config.hidden_size, config.hidden_size),
68
+ )
69
+
70
+ image_token_len = config.image_token_len
71
+ vision_tower_aux_token_len_list = (
72
+ self.config.mm_vision_tower_aux_token_len_list
73
+ )
74
+ cross_att_token_len_list = [
75
+ int(vision_tower_aux_token_len**0.5) // int(image_token_len**0.5)
76
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
77
+ ]
78
+
79
+ for aux_i, vision_tower_aux in enumerate(self.vision_tower_aux_list):
80
+ setattr(
81
+ self,
82
+ "mm_projector_aux_{}".format(aux_i),
83
+ nn.Sequential(
84
+ nn.Linear(vision_tower_aux.hidden_size, vision_hidden_size),
85
+ nn.GELU(),
86
+ nn.Linear(vision_hidden_size, vision_hidden_size),
87
+ nn.LayerNorm(vision_hidden_size),
88
+ ),
89
+ )
90
+
91
+ for query_group_i in range(num_query_group):
92
+ cross_att_token_len_list = [
93
+ int(vision_tower_aux_token_len**0.5)
94
+ // int(query_num_list[query_group_i] ** 0.5)
95
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
96
+ ]
97
+ setattr(
98
+ self,
99
+ "vision_sampler_{}".format(query_group_i),
100
+ VisionTokenSampler(
101
+ vision_hidden_size,
102
+ vision_hidden_size,
103
+ [vision_hidden_size] * len(self.vision_tower_aux_list),
104
+ cross_att_token_len_list,
105
+ vision_hidden_size,
106
+ connector_depth,
107
+ ),
108
+ )
109
+
110
+ if not connector_only:
111
+ num_of_vision_sampler_layers = (
112
+ config.num_of_vision_sampler_layers
113
+ ) = config.num_of_vision_sampler_layers
114
+ config.start_of_vision_sampler_layers = (
115
+ config.start_of_vision_sampler_layers
116
+ )
117
+ config.stride_of_vision_sampler_layers = (
118
+ config.stride_of_vision_sampler_layers
119
+ )
120
+ cross_att_token_len_list = [
121
+ int(vision_tower_aux_token_len**0.5)
122
+ // int(image_token_len**0.5)
123
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
124
+ ]
125
+ self.vision_sampler_layers = nn.ModuleList(
126
+ [
127
+ VisionTokenSampler(
128
+ config.hidden_size,
129
+ vision_hidden_size,
130
+ [vision_hidden_size] * len(self.vision_tower_aux_list),
131
+ cross_att_token_len_list,
132
+ vision_hidden_size,
133
+ 1,
134
+ )
135
+ for layer_idx in range(0, num_of_vision_sampler_layers)
136
+ ]
137
+ )
138
+
139
+ self.vision_query = nn.Parameter(
140
+ torch.randn((num_query_group, vision_hidden_size), dtype=self.dtype)
141
+ )
142
+
143
+ self.image_newline = nn.Parameter(
144
+ torch.empty(config.hidden_size, dtype=self.dtype)
145
+ )
146
+
147
+ self.frame_pos = torch.stack(
148
+ [
149
+ 1
150
+ / torch.pow(
151
+ torch.tensor(10000),
152
+ torch.tensor(2 * (hid_j // 2) / config.hidden_size),
153
+ )
154
+ for hid_j in range(config.hidden_size)
155
+ ]
156
+ )
157
+
158
+ else:
159
+ self.vision_tower_aux_list = build_vision_tower_aux_list(
160
+ config, delay_load=True
161
+ )
162
+ config.mm_hidden_size = sum(
163
+ [
164
+ vision_tower_aux.hidden_size
165
+ for vision_tower_aux in self.vision_tower_aux_list
166
+ ]
167
+ )
168
+ self.mm_projector = build_vision_projector(config)
169
+ self.image_newline = nn.Parameter(
170
+ torch.empty(config.hidden_size, dtype=self.dtype)
171
+ )
172
+
173
+ def get_frame_pos(self, time_range):
174
+ frame_pos = self.frame_pos.reshape(1, -1) * time_range.reshape(-1, 1).to(
175
+ self.frame_pos.device
176
+ )
177
+ frame_pos[:, 0::2] = torch.sin(frame_pos[:, 0::2])
178
+ frame_pos[:, 1::2] = torch.cos(frame_pos[:, 0::2])
179
+ frame_pos = frame_pos.unsqueeze(1)
180
+ return frame_pos
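# Annotation: get_frame_pos builds a sinusoidal temporal embedding per frame index t,
# in the spirit of the Transformer positional encoding pe[t, 2i] = sin(t / 10000^(2i/d)),
# pe[t, 2i+1] = cos(t / 10000^(2i/d)), using the inverse frequencies precomputed in
# self.frame_pos. Note that, as written, the cosine channels are taken from the
# already-sin-transformed even channels (cos(sin(x))) rather than from the raw phase.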
181
+
182
+ # def get_vision_tower(self):
183
+ # vision_tower = getattr(self, 'vision_tower', None)
184
+ # if type(vision_tower) is list:
185
+ # vision_tower = vision_tower[0]
186
+ # return vision_tower
187
+
188
+ def get_vision_tower_aux_list(self):
189
+ vision_tower_aux_list = getattr(self, "vision_tower_aux_list", None)
190
+ return vision_tower_aux_list
191
+
192
+ def initialize_vision_modules(self, model_args, fsdp=None):
193
+ # vision_tower = model_args.vision_tower
194
+ num_query_group = model_args.num_query_group
195
+ query_num_list = model_args.query_num_list
196
+ vision_hidden_size = model_args.vision_hidden_size
197
+ vision_tower_aux_list = model_args.vision_tower_aux_list
198
+ vision_tower_aux_token_len_list = model_args.vision_tower_aux_token_len_list
199
+ image_token_len = model_args.image_token_len
200
+ mm_vision_select_layer = model_args.mm_vision_select_layer
201
+ mm_vision_select_feature = model_args.mm_vision_select_feature
202
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
203
+ connector_only = model_args.connector_only
204
+ connector_depth = model_args.connector_depth
205
+
206
+ # self.config.mm_vision_tower = vision_tower
207
+ self.config.image_token_len = image_token_len
208
+ self.config.num_query_group = num_query_group
209
+ self.config.query_num_list = query_num_list
210
+ assert num_query_group == len(query_num_list)
211
+ self.config.connector_depth = connector_depth
212
+ self.config.mm_vision_tower_aux_list = vision_tower_aux_list
213
+ self.config.mm_vision_tower_aux_token_len_list = vision_tower_aux_token_len_list
214
+ self.config.connector_only = connector_only
215
+ self.config.highres_connect = model_args.highres_connect
216
+ self.config.highres = model_args.highres
217
+ self.config.frame_pos = model_args.frame_pos
218
+ self.config.lowres_token = model_args.lowres_token
219
+ self.config.connect_layer = model_args.connect_layer
220
+ self.config.dino_threshold = getattr(model_args, "dino_threshold", 0.83)
221
+ self.config.drop_threshold = getattr(model_args, "drop_threshold", 0.6)
222
+ self.config.is_image_newline = getattr(model_args, "is_image_newline", True)
223
+
224
+ if self.get_vision_tower_aux_list() is None:
225
+ vision_tower_aux_list = build_vision_tower_aux_list(model_args)
226
+ if model_args.unfreeze_mm_vision_tower:
227
+ self.vision_tower_aux_list = nn.ModuleList(vision_tower_aux_list)
228
+ else:
229
+ self.vision_tower_aux_list = vision_tower_aux_list
230
+ else:
231
+ vision_tower_aux_list = self.vision_tower_aux_list
232
+ for vision_tower_aux in vision_tower_aux_list:
233
+ vision_tower_aux.load_model()
234
+
235
+ self.config.use_mm_proj = True
236
+ self.config.mm_projector_type = getattr(
237
+ model_args, "mm_projector_type", "linear"
238
+ )
239
+ self.config.vision_hidden_size = vision_hidden_size
240
+ self.config.mm_vision_select_layer = mm_vision_select_layer
241
+ self.config.mm_vision_select_feature = mm_vision_select_feature
242
+
243
+ if getattr(self, "mm_projector", None) is None:
244
+
245
+ if self.config.mm_projector_type == "sva":
246
+ self.mm_projector = nn.Sequential(
247
+ nn.Linear(
248
+ vision_hidden_size * num_query_group, self.config.hidden_size
249
+ ),
250
+ nn.GELU(),
251
+ nn.Linear(self.config.hidden_size, self.config.hidden_size),
252
+ )
253
+ for aux_i, vision_tower_aux in enumerate(vision_tower_aux_list):
254
+ setattr(
255
+ self,
256
+ "mm_projector_aux_{}".format(aux_i),
257
+ nn.Sequential(
258
+ nn.Linear(vision_tower_aux.hidden_size, vision_hidden_size),
259
+ nn.GELU(),
260
+ nn.Linear(vision_hidden_size, vision_hidden_size),
261
+ nn.LayerNorm(vision_hidden_size),
262
+ ),
263
+ )
264
+
265
+ # vision sampler for each group of query as the connector before the LLM
266
+ for query_group_i in range(num_query_group):
267
+ cross_att_token_len_list = [
268
+ int(vision_tower_aux_token_len**0.5)
269
+ // int(query_num_list[query_group_i] ** 0.5)
270
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
271
+ ]
272
+ setattr(
273
+ self,
274
+ "vision_sampler_{}".format(query_group_i),
275
+ VisionTokenSampler(
276
+ vision_hidden_size,
277
+ vision_hidden_size,
278
+ [vision_hidden_size] * len(vision_tower_aux_list),
279
+ cross_att_token_len_list,
280
+ vision_hidden_size,
281
+ connector_depth,
282
+ ),
283
+ )
284
+
285
+ # sampler layers within LLM
286
+ if not connector_only:
287
+ num_of_vision_sampler_layers = (
288
+ self.config.num_of_vision_sampler_layers
289
+ ) = model_args.num_of_vision_sampler_layers
290
+ self.config.start_of_vision_sampler_layers = (
291
+ model_args.start_of_vision_sampler_layers
292
+ )
293
+ self.config.stride_of_vision_sampler_layers = (
294
+ model_args.stride_of_vision_sampler_layers
295
+ )
296
+ cross_att_token_len_list = [
297
+ int(vision_tower_aux_token_len**0.5)
298
+ // int(image_token_len**0.5)
299
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
300
+ ]
301
+ self.vision_sampler_layers = nn.ModuleList(
302
+ [
303
+ VisionTokenSampler(
304
+ self.config.hidden_size,
305
+ vision_hidden_size,
306
+ [vision_hidden_size] * len(vision_tower_aux_list),
307
+ cross_att_token_len_list,
308
+ vision_hidden_size,
309
+ 1,
310
+ )
311
+ for layer_idx in range(0, num_of_vision_sampler_layers)
312
+ ]
313
+ )
314
+ vision_embed_std = 1 / torch.sqrt(
315
+ torch.tensor(vision_hidden_size, dtype=self.dtype)
316
+ )
317
+ self.vision_query = nn.Parameter(
318
+ torch.randn((num_query_group, vision_hidden_size), dtype=self.dtype)
319
+ * vision_embed_std
320
+ )
321
+
322
+ embed_std = 1 / torch.sqrt(
323
+ torch.tensor(self.config.hidden_size, dtype=self.dtype)
324
+ )
325
+ self.image_newline = nn.Parameter(
326
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
327
+ )
328
+
329
+ else:
330
+ self.config.mm_hidden_size = sum(
331
+ [
332
+ vision_tower_aux.hidden_size
333
+ for vision_tower_aux in vision_tower_aux_list
334
+ ]
335
+ )
336
+ self.mm_projector = build_vision_projector(self.config)
337
+ embed_std = 1 / torch.sqrt(
338
+ torch.tensor(self.config.hidden_size, dtype=self.dtype)
339
+ )
340
+ self.image_newline = nn.Parameter(
341
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
342
+ )
343
+ else:
344
+ # In case it is frozen by LoRA
345
+ for p in self.mm_projector.parameters():
346
+ p.requires_grad = True
347
+
348
+ if pretrain_mm_mlp_adapter is not None:
349
+ mm_projector_weights = torch.load(
350
+ pretrain_mm_mlp_adapter, map_location="cpu"
351
+ )
352
+
353
+ def get_w(weights, keyword):
354
+ return {
355
+ k.split(keyword + ".")[1]: v
356
+ for k, v in weights.items()
357
+ if keyword + "." in k
358
+ }
359
+
360
+ self.mm_projector.load_state_dict(
361
+ get_w(mm_projector_weights, "mm_projector"), strict=True
362
+ )
363
+
364
+ if self.config.mm_projector_type == "sva":
365
+ for aux_i in range(len(vision_tower_aux_list)):
366
+ getattr(self, "mm_projector_aux_{}".format(aux_i)).load_state_dict(
367
+ get_w(
368
+ mm_projector_weights, "mm_projector_aux_{}".format(aux_i)
369
+ ),
370
+ strict=True,
371
+ )
372
+
373
+ for query_group_i in range(num_query_group):
374
+ getattr(
375
+ self, "vision_sampler_{}".format(query_group_i)
376
+ ).load_state_dict(
377
+ get_w(
378
+ mm_projector_weights,
379
+ "vision_sampler_{}".format(query_group_i),
380
+ ),
381
+ strict=True,
382
+ )
383
+
384
+ if not connector_only:
385
+ self.vision_sampler_layers.load_state_dict(
386
+ get_w(mm_projector_weights, "vision_sampler_layers"),
387
+ strict=True,
388
+ )
389
+ self.vision_query.data = mm_projector_weights["model.vision_query"]
390
+ self.image_newline.data = mm_projector_weights["model.image_newline"]
391
+
392
+
393
+ def unmask_attention_mask(mask, original_size):
394
+ original_w, original_h = original_size
395
+ cur_h, cur_w = mask.shape[1:3]
396
+
397
+ original_aspect_ratio = original_w / original_h
398
+ current_aspect_ratio = cur_w / cur_h
399
+
400
+ if original_aspect_ratio > current_aspect_ratio:
401
+ scale_factor = cur_w / original_w
402
+ new_height = int(original_h * scale_factor)
403
+ padding = (cur_h - new_height) // 2
404
+ if padding > 0:
405
+ mask[:, :padding, :] = 0
406
+ mask[:, -padding:, :] = 0
407
+ return mask
408
+ else:
409
+ scale_factor = cur_h / original_h
410
+ new_width = int(original_w * scale_factor)
411
+ padding = (cur_w - new_width) // 2
412
+ if padding > 0:
413
+ mask[:, :, :padding] = 0
414
+ mask[:, :, -padding:] = 0
415
+ return mask
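# Annotation: unmask_attention_mask zeroes the rows or columns of the attention mask
# that correspond to the letterbox padding introduced when a non-square image was
# resized onto the square feature grid, so padded positions are ignored downstream.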
416
+
417
+
418
+ def unpad_image(tensor, original_size):
419
+ """
420
+ Unpads a PyTorch tensor of a padded and resized image.
421
+
422
+ Args:
423
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
424
+ original_size (tuple): The original size of the image (width, height).
425
+
426
+ Returns:
427
+ torch.Tensor: The unpadded image tensor.
428
+ """
429
+ original_width, original_height = original_size
430
+ current_height, current_width = tensor.shape[1:3]
431
+
432
+ original_aspect_ratio = original_width / original_height
433
+ current_aspect_ratio = current_width / current_height
434
+
435
+ if original_aspect_ratio > current_aspect_ratio:
436
+ scale_factor = current_width / original_width
437
+ new_height = int(original_height * scale_factor)
438
+ padding = (current_height - new_height) // 2
439
+ unpadded_tensor = tensor[:, padding : current_height - padding, :]
440
+ # if 0 in unpadded_tensor.shape:
441
+ # print(f"scale_factor: {scale_factor}, new_height: {new_height}, padding: {padding}, original_width: {original_width}, original_height: {original_height}")
442
+ else:
443
+ scale_factor = current_height / original_height
444
+ new_width = int(original_width * scale_factor)
445
+ padding = (current_width - new_width) // 2
446
+ unpadded_tensor = tensor[:, :, padding : current_width - padding]
447
+ # if 0 in unpadded_tensor.shape:
448
+ # print(f"scale_factor: {scale_factor}, new_width: {new_width}, padding: {padding}, original_width: {original_width}, original_height: {original_height}")
449
+
450
+ return unpadded_tensor
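# Annotation (worked example): for original_size = (1280, 720) and a 24x24 feature grid,
# the original aspect ratio (16:9) exceeds the grid's (1:1), so scale_factor = 24/1280,
# new_height = int(720 * 24/1280) = 13, padding = (24 - 13) // 2 = 5, and rows 5..18
# (14 of 24) are kept; columns are left untouched.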
451
+
452
+
453
+ class CambrianMetaForCausalLM(ABC):
454
+
455
+ @abstractmethod
456
+ def get_model(self):
457
+ pass
458
+
459
+ # def get_vision_tower(self):
460
+ # return self.get_model().get_vision_tower()
461
+
462
+ def get_vision_tower_aux_list(self):
463
+ return self.get_model().get_vision_tower_aux_list()
464
+
465
+ def rearrange_vision_tower_features_train(
466
+ self,
467
+ vision_tower_aux_feature_list,
468
+ vision_tower_aux_attention_masks_list,
469
+ query_side_len,
470
+ ):
471
+ vision_tower_aux_feature_rearranged_list = []
472
+ vision_tower_aux_attention_masks_rearranged_list = []
473
+ bs = vision_tower_aux_feature_list[0].shape[0]
474
+ for vision_tower_aux_feature, vision_tower_aux_attention_masks in zip(
475
+ vision_tower_aux_feature_list, vision_tower_aux_attention_masks_list
476
+ ):
477
+ aux_height = aux_width = int(vision_tower_aux_feature.shape[1] ** 0.5)
478
+ assert (aux_height // query_side_len) * query_side_len == aux_height
479
+
480
+ reduce_factor = aux_height // query_side_len
481
+ vision_tower_aux_feature_rearranged = vision_tower_aux_feature.view(
482
+ bs, query_side_len, reduce_factor, query_side_len, reduce_factor, -1
483
+ )
484
+ vision_tower_aux_feature_rearranged = (
485
+ vision_tower_aux_feature_rearranged.permute(0, 1, 3, 2, 4, 5)
486
+ .contiguous()
487
+ .flatten(0, 2)
488
+ .flatten(1, 2)
489
+ )
490
+
491
+ vision_tower_aux_attention_masks_rearranged = (
492
+ vision_tower_aux_attention_masks.view(
493
+ bs * query_side_len * query_side_len, reduce_factor * reduce_factor
494
+ )
495
+ )
496
+
497
+ vision_tower_aux_feature_rearranged_list.append(
498
+ vision_tower_aux_feature_rearranged
499
+ )
500
+ vision_tower_aux_attention_masks_rearranged_list.append(
501
+ vision_tower_aux_attention_masks_rearranged
502
+ )
503
+ return (
504
+ vision_tower_aux_feature_rearranged_list,
505
+ vision_tower_aux_attention_masks_rearranged_list,
506
+ )
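# Annotation: the rearrangement above splits each auxiliary tower's token grid into
# query_side_len x query_side_len blocks, so that every query token can cross-attend
# to its own reduce_factor x reduce_factor patch of higher-resolution vision tokens.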
507
+
508
+ def rearrange_vision_tower_features_inference(
509
+ self, vision_tower_aux_feature_list, query_side_len, image_sizes, unpad=False
510
+ ):
511
+ vision_tower_aux_feature_rearranged_list = []
512
+ vision_tower_aux_attention_masks_rearranged_list = []
513
+ bs = vision_tower_aux_feature_list[0].shape[0]
514
+ for vision_tower_aux_feature in vision_tower_aux_feature_list:
515
+ aux_height = aux_width = int(vision_tower_aux_feature.shape[1] ** 0.5)
516
+ assert (aux_height // query_side_len) * query_side_len == aux_height
517
+
518
+ reduce_factor = aux_height // query_side_len
519
+
520
+ vision_tower_aux_feature_rearranged = []
521
+ vision_tower_aux_attention_masks_rearranged = []
522
+ for batch_i in range(bs):
523
+ image_size = image_sizes[batch_i]
524
+ cur_vision_tower_aux_feature = vision_tower_aux_feature[batch_i]
525
+
526
+ cur_vision_tower_aux_attention_masks_rearranged = torch.ones(
527
+ (1, aux_height, aux_width),
528
+ dtype=torch.bool,
529
+ device=cur_vision_tower_aux_feature.device,
530
+ )
531
+ cur_vision_tower_aux_feature_rearranged = (
532
+ cur_vision_tower_aux_feature.view(
533
+ 1,
534
+ query_side_len,
535
+ reduce_factor,
536
+ query_side_len,
537
+ reduce_factor,
538
+ -1,
539
+ )
540
+ )
541
+ cur_vision_tower_aux_feature_rearranged = (
542
+ cur_vision_tower_aux_feature_rearranged.permute(
543
+ 0, 1, 3, 2, 4, 5
544
+ ).contiguous()
545
+ )
546
+ if unpad:
547
+ cur_vision_tower_aux_feature_rearranged = unpad_image(
548
+ cur_vision_tower_aux_feature_rearranged, image_size
549
+ )
550
+ cur_vision_tower_aux_feature_rearranged = (
551
+ cur_vision_tower_aux_feature_rearranged.flatten(0, 2).flatten(1, 2)
552
+ ) # query_side_len*query_side_len X reduce_factor*reduce_factor X C
553
+
554
+ cur_vision_tower_aux_attention_masks_rearranged = unmask_attention_mask(
555
+ cur_vision_tower_aux_attention_masks_rearranged, image_size
556
+ )
557
+ cur_vision_tower_aux_attention_masks_rearranged = (
558
+ cur_vision_tower_aux_attention_masks_rearranged.view(
559
+ 1, query_side_len, reduce_factor, query_side_len, reduce_factor
560
+ )
561
+ .permute(0, 1, 3, 2, 4)
562
+ .contiguous()
563
+ )
564
+ if unpad:
565
+ cur_vision_tower_aux_attention_masks_rearranged = unpad_image(
566
+ cur_vision_tower_aux_attention_masks_rearranged, image_size
567
+ )
568
+ cur_vision_tower_aux_attention_masks_rearranged = (
569
+ cur_vision_tower_aux_attention_masks_rearranged.flatten(
570
+ 0, 2
571
+ ).flatten(1, 2)
572
+ )
573
+
574
+ cur_vision_tower_aux_attention_masks_rearranged[
575
+ cur_vision_tower_aux_attention_masks_rearranged.sum(-1) == 0
576
+ ] = True
577
+
578
+ vision_tower_aux_feature_rearranged.append(
579
+ cur_vision_tower_aux_feature_rearranged
580
+ )
581
+ vision_tower_aux_attention_masks_rearranged.append(
582
+ cur_vision_tower_aux_attention_masks_rearranged
583
+ )
584
+
585
+ vision_tower_aux_feature_rearranged = torch.cat(
586
+ vision_tower_aux_feature_rearranged, 0
587
+ )
588
+ vision_tower_aux_attention_masks_rearranged = torch.cat(
589
+ vision_tower_aux_attention_masks_rearranged, 0
590
+ )
591
+
592
+ vision_tower_aux_feature_rearranged_list.append(
593
+ vision_tower_aux_feature_rearranged
594
+ )
595
+ vision_tower_aux_attention_masks_rearranged_list.append(
596
+ vision_tower_aux_attention_masks_rearranged
597
+ )
598
+
599
+ return (
600
+ vision_tower_aux_feature_rearranged_list,
601
+ vision_tower_aux_attention_masks_rearranged_list,
602
+ )
603
+
604
+ def encode_images(self, image_aux_list, encode_type=None):
605
+ vision_tower_aux_list = self.get_model().get_vision_tower_aux_list()
606
+ image_aux_features_list = []
607
+ chunk_size = 64
608
+ if encode_type == "dino":
609
+ image_aux = image_aux_list[-1]
610
+ vision_tower_aux = vision_tower_aux_list[-1]
611
+ if image_aux.shape[0] > chunk_size:
612
+ image_aux_features_chunks = []
613
+ for start_idx in range(0, image_aux.shape[0], chunk_size):
614
+ end_idx = min(start_idx + chunk_size, image_aux.shape[0])
615
+ chunk = image_aux[start_idx:end_idx]
616
+ image_aux_features_chunk = vision_tower_aux(chunk)
617
+ image_aux_features_chunks.append(image_aux_features_chunk)
618
+ image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
619
+ else:
620
+ image_aux_features = vision_tower_aux(image_aux)
621
+ return image_aux_features
622
+ elif encode_type == "siglip":
623
+ image_aux = image_aux_list[0]
624
+ vision_tower_aux = vision_tower_aux_list[0]
625
+ if image_aux.shape[0] > chunk_size:
626
+ image_aux_features_chunks = []
627
+ for start_idx in range(0, image_aux.shape[0], chunk_size):
628
+ end_idx = min(start_idx + chunk_size, image_aux.shape[0])
629
+ chunk = image_aux[start_idx:end_idx]
630
+ image_aux_features_chunk = vision_tower_aux(chunk)
631
+ image_aux_features_chunks.append(image_aux_features_chunk)
632
+ image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
633
+ else:
634
+ image_aux_features = vision_tower_aux(image_aux)
635
+ return image_aux_features
636
+ else:
637
+ for image_aux, vision_tower_aux in zip(
638
+ image_aux_list, vision_tower_aux_list
639
+ ):
640
+ if image_aux.shape[0] > chunk_size:
641
+ image_aux_features_chunks = []
642
+ for start_idx in range(0, image_aux.shape[0], chunk_size):
643
+ end_idx = min(start_idx + chunk_size, image_aux.shape[0])
644
+ chunk = image_aux[start_idx:end_idx]
645
+ image_aux_features_chunk = vision_tower_aux(chunk)
646
+ image_aux_features_chunks.append(image_aux_features_chunk)
647
+ image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
648
+ else:
649
+ image_aux_features = vision_tower_aux(image_aux)
650
+ image_aux_features_list.append(image_aux_features)
651
+ return image_aux_features_list
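# Annotation: frames are pushed through each auxiliary vision tower in chunks of 64
# (chunk_size) to bound peak memory when a video contributes many frames; encode_type
# "siglip" / "dino" runs only the first / last tower in vision_tower_aux_list.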
652
+
653
+ def select_frame(
654
+ self,
655
+ feature_list,
656
+ split_sizes,
657
+ input_ids,
658
+ new_image_aux_list,
659
+ image_sizes,
660
+ window_size=16,
661
+ threshold=0.83,
662
+ ):
663
+ dino_features_batch = torch.split(feature_list, split_sizes, dim=0)
664
+ new_image_aux_batch_0 = torch.split(new_image_aux_list[0], split_sizes, dim=0)
665
+ new_image_aux_batch_1 = torch.split(new_image_aux_list[1], split_sizes, dim=0)
666
+ new_split_sizes = []
667
+ selected_frames_all_0 = []
668
+ selected_frames_all_1 = []
669
+ selected_frames_feature_all = []
670
+ selected_frame_indices_all = []
671
+ for i_batch, frame_features in enumerate(dino_features_batch):
672
+ try:
673
+ if "llama" in self.get_model().config.model_type:
674
+ text_len = torch.where(input_ids[i_batch] == 128002)[-1][0]
675
+ else:
676
+ text_len = torch.where(input_ids[i_batch] == 151643)[-1][0]
677
+ except:
678
+ text_len = len(input_ids[i_batch])
679
+ original_width, original_height = image_sizes[i_batch]
680
+ if getattr(self.get_model().config, "highres", False):
681
+ token_per_frame = self.get_model().config.lowres_token ** 2
682
+ else:
683
+ token_per_frame = self.get_model().config.image_token_len
684
+ # current_height, current_width = token_per_side, token_per_side
685
+ # original_aspect_ratio = original_width / original_height
686
+ # current_aspect_ratio = current_width / current_height
687
+ # if original_aspect_ratio > current_aspect_ratio:
688
+ # scale_factor = current_width / original_width
689
+ # new_height = int(original_height * scale_factor)
690
+ # padding = math.ceil((current_height - new_height) / 2.0)
691
+ # token_per_frame = (
692
+ # current_height - padding * 2
693
+ # ) * token_per_side + token_per_side
694
+ # else:
695
+ # scale_factor = current_height / original_height
696
+ # new_width = int(original_width * scale_factor)
697
+ # padding = math.ceil((current_width - new_width) / 2.0)
698
+ # token_per_frame = (current_width - padding * 2) * token_per_side + (
699
+ # current_width - padding * 2
700
+ # )
701
+ # token_per_frame = (
702
+ # token_per_side**2 if token_per_frame < 1 else token_per_frame
703
+ # )
704
+ max_num_frames = max(
705
+ 1,
706
+ (
707
+ self.get_model().config.tokenizer_model_max_length
708
+ - text_len
709
+ - getattr(self.get_model().config, "inference_max_length", 16)
710
+ )
711
+ // token_per_frame,
712
+ )
713
+ if len(frame_features) < max_num_frames:
714
+ selected_frames_all_0.append(new_image_aux_batch_0[i_batch])
715
+ selected_frames_all_1.append(new_image_aux_batch_1[i_batch])
716
+ selected_frames_feature_all.append(frame_features)
717
+ new_split_sizes.append(len(frame_features))
718
+ selected_frame_indices_all.append(torch.arange(len(frame_features)))
719
+ continue
720
+
721
+ num_segments = len(frame_features) // window_size
722
+ if num_segments == 0:
723
+ query_feature = frame_features.flatten(1, 2)
724
+ query_feature = query_feature / torch.norm(
725
+ (query_feature), dim=1, keepdim=True
726
+ )
727
+ similarities = torch.mean(query_feature @ query_feature.T, dim=1)
728
+ similarities[len(frame_features) // 2] = 0
729
+ indices = torch.where(similarities < threshold)[0]
730
+ selected_frame_indices_all.append(indices)
731
+ selected_frames_all_0.append(new_image_aux_batch_0[i_batch][indices])
732
+ selected_frames_all_1.append(new_image_aux_batch_1[i_batch][indices])
733
+ selected_frames_feature_all.append(frame_features[indices])
734
+ new_split_sizes.append(len(indices))
735
+ continue
736
+ segments_frames_0 = []
737
+ segments_frames_1 = []
738
+ segments_features = []
739
+ for start_idx in range(0, len(frame_features), window_size):
740
+ end_idx = min(start_idx + window_size, len(frame_features))
741
+ segments_frames_0.append(
742
+ new_image_aux_batch_0[i_batch][start_idx:end_idx]
743
+ )
744
+ segments_frames_1.append(
745
+ new_image_aux_batch_1[i_batch][start_idx:end_idx]
746
+ )
747
+ segments_features.append(frame_features[start_idx:end_idx])
748
+ selected_frames_0 = []
749
+ selected_frames_1 = []
750
+ selected_features = []
751
+ selected_frame_indices = []
752
+ for i, segment in enumerate(segments_features):
753
+ query_feature = segment.flatten(1, 2)
754
+ query_feature = query_feature / torch.norm(
755
+ (query_feature), dim=1, keepdim=True
756
+ )
757
+ similarities = torch.mean(query_feature @ query_feature.T, dim=1)
758
+ similarities[len(segment) // 2] = 0
759
+ indices = torch.where(similarities < threshold)[0]
760
+ selected_frames_0.append(segments_frames_0[i][indices])
761
+ selected_frames_1.append(segments_frames_1[i][indices])
762
+ selected_features.append(segment[indices])
763
+ selected_frame_indices.extend(indices + i * window_size)
764
+ selected_frames_0 = torch.cat(selected_frames_0, dim=0)
765
+ selected_frames_1 = torch.cat(selected_frames_1, dim=0)
766
+ selected_features = torch.cat(selected_features, dim=0)
767
+ selected_frame_indices = torch.tensor(selected_frame_indices)
768
+ # ablation
769
+ max_num_frames = 400 # in case of OOM
770
+ if len(selected_frames_0) > max_num_frames:
771
+ interval = len(selected_frames_0) / float(max_num_frames)
772
+ indices = [int(interval * i) for i in range(max_num_frames)]
773
+ new_split_sizes.append(len(indices))
774
+ selected_frames_all_0.append(selected_frames_0[indices])
775
+ selected_frames_all_1.append(selected_frames_1[indices])
776
+ selected_frames_feature_all.append(selected_features[indices])
777
+ selected_frame_indices = selected_frame_indices[indices]
778
+ else:
779
+ new_split_sizes.append(len(selected_frames_0))
780
+ selected_frames_all_0.append(selected_frames_0)
781
+ selected_frames_all_1.append(selected_frames_1)
782
+ selected_frames_feature_all.append(selected_features)
783
+ selected_frame_indices_all.append(selected_frame_indices)
784
+ selected_frames_all_0 = torch.cat(selected_frames_all_0, dim=0)
785
+ selected_frames_all_1 = torch.cat(selected_frames_all_1, dim=0)
786
+ selected_frames_feature_all = torch.cat(selected_frames_feature_all, dim=0)
787
+ return (
788
+ selected_frames_feature_all,
789
+ new_split_sizes,
790
+ [selected_frames_all_0, selected_frames_all_1],
791
+ selected_frame_indices_all,
792
+ )
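# Annotation: select_frame prunes temporally redundant frames using the DINO features.
# Within each window of `window_size` (16) frames, frames whose mean cosine similarity
# to the other frames reaches `threshold` (default 0.83, config key dino_threshold) are
# dropped; the middle frame of a window is always kept (its similarity is zeroed), and
# the selection is capped at 400 frames to avoid running out of memory.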
793
+
794
+ def prepare_inputs_labels_for_multimodal(
795
+ self,
796
+ input_ids,
797
+ position_ids,
798
+ attention_mask,
799
+ past_key_values,
800
+ labels,
801
+ images,
802
+ image_aux_attention_masks_list=None,
803
+ image_sizes=None,
804
+ ):
805
+ # vision_tower = self.get_vision_tower()
806
+ vision_tower_aux_list = self.get_model().get_vision_tower_aux_list()
807
+ if vision_tower_aux_list is None or images is None or input_ids.shape[1] == 1:
808
+ return (
809
+ input_ids,
810
+ position_ids,
811
+ attention_mask,
812
+ past_key_values,
813
+ None,
814
+ labels,
815
+ None,
816
+ None,
817
+ None,
818
+ None,
819
+ )
820
+
821
+ image_aux_list = images
822
+
823
+ split_sizes = None
824
+
825
+ if type(image_aux_list[0]) is list or image_aux_list[0].ndim == 5:
826
+ split_sizes_ori = [
827
+ 1 if image.ndim == 3 else image.shape[0] for image in image_aux_list[0]
828
+ ]
829
+ new_image_aux_list = []
830
+ for image_aux in image_aux_list:
831
+ if type(image_aux) is list:
832
+ image_aux = [
833
+ x.unsqueeze(0) if x.ndim == 3 else x for x in image_aux
834
+ ]
835
+ concat_image_aux = torch.cat([image for image in image_aux], dim=0)
836
+ new_image_aux_list.append(concat_image_aux)
837
+ image_aux_features_dino = self.encode_images(
838
+ new_image_aux_list, encode_type="dino"
839
+ )
840
+
841
+ (
842
+ image_aux_features_dino,
843
+ split_sizes,
844
+ new_image_aux_list,
845
+ selected_frame_indices_all,
846
+ ) = self.select_frame(
847
+ image_aux_features_dino,
848
+ split_sizes_ori,
849
+ input_ids,
850
+ new_image_aux_list,
851
+ image_sizes,
852
+ threshold=getattr(self.get_model().config, "dino_threshold", 0.83),
853
+ )
854
+
855
+ image_aux_features_siglip = self.encode_images(
856
+ new_image_aux_list, encode_type="siglip"
857
+ )
858
+ image_aux_features_list = [
859
+ image_aux_features_siglip,
860
+ image_aux_features_dino,
861
+ ]
862
+
863
+ bs = image_aux_features_list[0].shape[0]
864
+ dtype = new_image_aux_list[0].dtype
865
+
866
+ frame_sizes = []
867
+ for i in range(len(image_sizes)):
868
+ for j in range(split_sizes[i]):
869
+ frame_sizes.append(image_sizes[i])
870
+ image_sizes = frame_sizes
871
+ else:
872
+ image_aux_features_list = self.encode_images(image_aux_list)
873
+ bs = image_aux_list[0].shape[0]
874
+ dtype = image_aux_list[0].dtype
875
+
876
+ image_token_len = self.get_model().config.image_token_len
877
+ query_num_list = self.get_model().config.query_num_list
878
+
879
+ final_height = final_width = int(image_token_len**0.5)
880
+
881
+ final_image_features_list = []
882
+ final_image_features_down_list = []
883
+
884
+ # only needed for sva
885
+ vision_tower_aux_feature_list_final = None
886
+ vision_tower_aux_attention_masks_list_final = None
887
+ global_context_feature_final = None
888
+
889
+ if self.get_model().config.mm_projector_type == "sva":
890
+ vision_tower_aux_feature_list = []
891
+ vision_tower_aux_attention_masks_list = []
892
+ # get vision tokens from each vision tower
893
+ for aux_i in range(len(vision_tower_aux_list)):
894
+ image_aux_features = image_aux_features_list[aux_i]
895
+
896
+ image_aux_features = getattr(
897
+ self.get_model(), "mm_projector_aux_{}".format(aux_i)
898
+ )(image_aux_features).to(dtype)
899
+ if aux_i == 0:
900
+ global_context_feature = image_aux_features.mean(1).view(
901
+ bs, 1, 1, -1
902
+ )
903
+
904
+ vision_tower_aux_feature_list.append(image_aux_features)
905
+ input_mix_res = True
906
+ input_high_res = True
907
+ # perform vision sampling for each query group
908
+ for query_group_i, query_num in enumerate(query_num_list):
909
+ query_features_i = (
910
+ self.get_model()
911
+ .vision_query[query_group_i, :]
912
+ .view(1, 1, 1, -1)
913
+ .expand(bs, query_num, -1, -1)
914
+ )
915
+ global_context_feature_i = global_context_feature.expand(
916
+ -1, query_num, 1, -1
917
+ ).flatten(0, 1)
918
+ query_side_len = int(query_num**0.5)
919
+ if IS_XLA_AVAILABLE:
920
+ (
921
+ vision_tower_aux_feature_list_i,
922
+ vision_tower_aux_attention_masks_list_i,
923
+ ) = self.rearrange_vision_tower_features_train(
924
+ vision_tower_aux_feature_list,
925
+ image_aux_attention_masks_list,
926
+ query_side_len,
927
+ )
928
+ else:
929
+ (
930
+ vision_tower_aux_feature_list_i,
931
+ vision_tower_aux_attention_masks_list_i,
932
+ ) = self.rearrange_vision_tower_features_inference(
933
+ vision_tower_aux_feature_list, query_side_len, image_sizes
934
+ )
935
+
936
+ query_features_i = getattr(
937
+ self.get_model(), "vision_sampler_{}".format(query_group_i)
938
+ )(
939
+ query_features_i.flatten(0, 1),
940
+ global_context_feature_i,
941
+ *vision_tower_aux_feature_list_i,
942
+ *vision_tower_aux_attention_masks_list_i,
943
+ )
944
+ query_features_i = query_features_i.view(bs, query_num, -1)
945
+
946
+ if split_sizes is not None:
947
+ try:
948
+ if "llama" in self.get_model().config.model_type:
949
+ text_len = torch.where(input_ids[0] == 128002)[-1][0]
950
+ else:
951
+ text_len = torch.where(input_ids[0] == 151643)[-1][0]
952
+ except:
953
+ text_len = len(input_ids[0])
954
+ max_visual_len = (
955
+ self.get_model().config.tokenizer_model_max_length
956
+ - text_len
957
+ - getattr(self.get_model().config, "inference_max_length", 16)
958
+ )
959
+ max_num_frames = max(
960
+ 1,
961
+ math.floor(max_visual_len // (final_height * final_width)),
962
+ )
963
+ max_num_frames_low = max(
964
+ 1,
965
+ math.floor(
966
+ max_visual_len
967
+ // (self.get_model().config.lowres_token ** 2)
968
+ ),
969
+ )
970
+ if split_sizes[0] < max_num_frames:
971
+ input_mix_res = False
972
+ elif split_sizes[0] > max_num_frames_low:
973
+ input_mix_res = False
974
+ input_high_res = False
975
+
976
+ # input_mix_res = False # ablation
977
+
978
+ if (getattr(self.config, "highres", False)) and input_mix_res:
979
+ _query_features_i = (
980
+ query_features_i.permute(0, 2, 1)
981
+ .contiguous()
982
+ .view(bs, -1, query_side_len, query_side_len)
983
+ )
984
+ _query_features_i = F.interpolate(
985
+ _query_features_i.float(),
986
+ size=(
987
+ self.get_model().config.lowres_token,
988
+ self.get_model().config.lowres_token,
989
+ ),
990
+ mode="bilinear",
991
+ align_corners=False,
992
+ ).to(dtype=query_features_i.dtype)
993
+ _query_features_i = (
994
+ _query_features_i.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
995
+ )
996
+ final_image_features_down_list.append(_query_features_i)
997
+
998
+ # interpolate to the final target size
999
+ if query_side_len != final_height:
1000
+ query_features_i = (
1001
+ query_features_i.permute(0, 2, 1)
1002
+ .contiguous()
1003
+ .view(bs, -1, query_side_len, query_side_len)
1004
+ )
1005
+ if input_high_res:
1006
+ query_features_i = F.interpolate(
1007
+ query_features_i.float(),
1008
+ size=(final_height, final_width),
1009
+ mode="bilinear",
1010
+ align_corners=False,
1011
+ ).to(dtype=query_features_i.dtype)
1012
+ else:
1013
+ query_features_i = F.interpolate(
1014
+ query_features_i.float(),
1015
+ size=(8, 8),
1016
+ mode="bilinear",
1017
+ align_corners=False,
1018
+ ).to(dtype=query_features_i.dtype)
1019
+ query_features_i = (
1020
+ query_features_i.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
1021
+ )
1022
+ final_image_features_list.append(query_features_i)
1023
+
1024
+ if IS_XLA_AVAILABLE:
1025
+ (
1026
+ vision_tower_aux_feature_list_final,
1027
+ vision_tower_aux_attention_masks_list_final,
1028
+ ) = self.rearrange_vision_tower_features_train(
1029
+ vision_tower_aux_feature_list,
1030
+ image_aux_attention_masks_list,
1031
+ final_height,
1032
+ )
1033
+ global_context_feature_final = global_context_feature.expand(
1034
+ -1, final_height * final_width, 1, -1
1035
+ ).flatten(0, 1)
1036
+ else:
1037
+ final_image_features_list = image_aux_features_list
1038
+
1039
+ image_features = torch.cat(final_image_features_list, -1)
1040
+ image_features = self.get_model().mm_projector(image_features).to(dtype)
1041
+
1042
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1043
+ image_features_down = torch.cat(final_image_features_down_list, -1)
1044
+ image_features_down = (
1045
+ self.get_model().mm_projector(image_features_down).to(dtype)
1046
+ )
1047
+
1048
+ if IS_XLA_AVAILABLE:
1049
+ image_features = image_features.view(
1050
+ image_features.shape[0], final_height, final_width, -1
1051
+ )
1052
+ image_features = torch.cat(
1053
+ (
1054
+ image_features,
1055
+ self.model.image_newline[None, None, None, :].expand(
1056
+ image_features.shape[0], final_height, 1, -1
1057
+ ),
1058
+ ),
1059
+ dim=2,
1060
+ )
1061
+ image_features = image_features.flatten(1, 2)
1062
+ final_size = [(final_height, final_width)] * bs
1063
+
1064
+ else:
1065
+ image_features = image_features.view(bs, final_height, final_width, -1)
1066
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1067
+ image_features_down = image_features_down.view(
1068
+ bs,
1069
+ self.get_model().config.lowres_token,
1070
+ self.get_model().config.lowres_token,
1071
+ -1,
1072
+ )
1073
+ image_features_unpadded = []
1074
+ image_features_downsample = []
1075
+ final_size = []
1076
+ if self.get_model().config.mm_projector_type == "sva":
1077
+ (
1078
+ vision_tower_aux_feature_list_final,
1079
+ vision_tower_aux_attention_masks_list_final,
1080
+ ) = self.rearrange_vision_tower_features_inference(
1081
+ vision_tower_aux_feature_list, final_height, image_sizes, unpad=True
1082
+ )
1083
+ global_context_feature_final = []
1084
+ for batch_i in range(bs):
1085
+ cur_image_feature = image_features[batch_i]
1086
+ image_size = image_sizes[batch_i]
1087
+
1088
+ cur_image_feature = unpad_image(
1089
+ cur_image_feature.unsqueeze(0), image_size
1090
+ )
1091
+
1092
+ cur_h, cur_w = cur_image_feature.shape[1:3]
1093
+ try: # fix bug for some invalid image
1094
+ cur_image_feature = cur_image_feature.view(1, cur_h, cur_w, -1)
1095
+ final_size.append((cur_h, cur_w))
1096
+ except:
1097
+ # print(f"invalid after unpad {image_features[batch_i].shape}, {image_sizes[batch_i]}", flush=True)
1098
+ cur_image_feature = image_features[batch_i].unsqueeze(0)
1099
+ image_size = image_sizes[batch_i]
1100
+ cur_h, cur_w = cur_image_feature.shape[1:3]
1101
+ cur_image_feature = cur_image_feature.view(1, cur_h, cur_w, -1)
1102
+ final_size.append((cur_h, cur_w))
1103
+
1104
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1105
+ cur_image_feature_down = unpad_image(
1106
+ image_features_down[batch_i].unsqueeze(0),
1107
+ (
1108
+ int(
1109
+ image_size[0]
1110
+ / (
1111
+ image_token_len**0.5
1112
+ / self.get_model().config.lowres_token
1113
+ )
1114
+ ),
1115
+ int(
1116
+ image_size[1]
1117
+ / (
1118
+ image_token_len**0.5
1119
+ / self.get_model().config.lowres_token
1120
+ )
1121
+ ),
1122
+ ),
1123
+ )
1124
+ _cur_h, _cur_w = cur_image_feature_down.shape[1:3]
1125
+
1126
+ try: # fix bug for some invalid image
1127
+ cur_image_feature_down = cur_image_feature_down.view(
1128
+ 1, _cur_h, _cur_w, -1
1129
+ )
1130
+ except:
1131
+ print("invalid after unpad", flush=True)
1132
+ cur_image_feature_down = image_features_down[batch_i].unsqueeze(
1133
+ 0
1134
+ )
1135
+ _cur_h, _cur_w = cur_image_feature_down.shape[1:3]
1136
+ cur_image_feature_down = cur_image_feature_down.view(
1137
+ 1, _cur_h, _cur_w, -1
1138
+ )
1139
+
1140
+ cur_image_feature_down = torch.cat(
1141
+ (
1142
+ cur_image_feature_down,
1143
+ self.model.image_newline.view(1, 1, 1, -1)
1144
+ .expand(1, _cur_h, 1, -1)
1145
+ .to(cur_image_feature_down.device),
1146
+ ),
1147
+ dim=2,
1148
+ ).flatten(1, 2)
1149
+
1150
+ if split_sizes is None and getattr(self.config, "frame_pos", False):
1151
+ frame_pos = (
1152
+ self.get_model()
1153
+ .get_frame_pos(torch.arange(1))
1154
+ .to(cur_image_feature_down.device)
1155
+ .to(cur_image_feature_down.dtype)
1156
+ )
1157
+ cur_image_feature_down += frame_pos
1158
+
1159
+ image_features_downsample.append(cur_image_feature_down.squeeze(0))
1160
+
1161
+ cur_image_feature = torch.cat(
1162
+ (
1163
+ cur_image_feature,
1164
+ self.model.image_newline.view(1, 1, 1, -1)
1165
+ .expand(1, cur_h, 1, -1)
1166
+ .to(cur_image_feature.device),
1167
+ ),
1168
+ dim=2,
1169
+ )
1170
+
1171
+ if split_sizes is None and getattr(self.config, "frame_pos", False):
1172
+ frame_pos = (
1173
+ self.get_model()
1174
+ .get_frame_pos(torch.arange(1))
1175
+ .to(cur_image_feature.device)
1176
+ .to(cur_image_feature.dtype)
1177
+ )
1178
+ cur_image_feature += frame_pos
1179
+
1180
+ cur_image_feature = cur_image_feature.flatten(1, 2)
1181
+ image_features_unpadded.append(cur_image_feature.squeeze(0))
1182
+
1183
+ if self.get_model().config.mm_projector_type == "sva":
1184
+ cur_global_context_feature = global_context_feature[batch_i].expand(
1185
+ cur_h * cur_w, 1, -1
1186
+ )
1187
+ global_context_feature_final.append(cur_global_context_feature)
1188
+ if self.get_model().config.mm_projector_type == "sva":
1189
+ global_context_feature_final = torch.cat(
1190
+ global_context_feature_final, 0
1191
+ )
1192
+
1193
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1194
+ image_features = image_features_downsample
1195
+ else:
1196
+ image_features = image_features_unpadded
1197
+
1198
+ # TODO: image start / end is not implemented here to support pretraining.
1199
+ if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
1200
+ self.config, "mm_use_im_start_end", False
1201
+ ):
1202
+ raise NotImplementedError
1203
+
1204
+ split_image_features_unpadded = None
1205
+ frame_split_sizes = None
1206
+
1207
+ if split_sizes is not None:
1208
+ split_image_features = []
1209
+ split_image_features_unpadded = (
1210
+ []
1211
+ if (getattr(self.config, "highres", False)) and input_mix_res
1212
+ else None
1213
+ )
1214
+ start_idx = 0
1215
+ for split_batch_idx, split_size in enumerate(split_sizes):
1216
+ if isinstance(image_features[start_idx : start_idx + split_size], list):
1217
+ if getattr(self.config, "frame_pos", False):
1218
+ frame_feature = torch.cat(
1219
+ image_features[start_idx : start_idx + split_size], dim=0
1220
+ ).reshape(split_size, -1, image_features[0].shape[-1])
1221
+ frame_pos = (
1222
+ self.get_model()
1223
+ .get_frame_pos(selected_frame_indices_all[split_batch_idx])
1224
+ .to(frame_feature.device)
1225
+ .to(frame_feature.dtype)
1226
+ )
1227
+ frame_feature += frame_pos
1228
+ split_image_features.append(
1229
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1230
+ )
1231
+ else:
1232
+ split_image_features.append(
1233
+ torch.cat(
1234
+ image_features[start_idx : start_idx + split_size],
1235
+ dim=0,
1236
+ )
1237
+ )
1238
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1239
+ if getattr(self.config, "frame_pos", False):
1240
+ frame_feature = torch.cat(
1241
+ image_features_unpadded[
1242
+ start_idx : start_idx + split_size
1243
+ ],
1244
+ dim=0,
1245
+ ).reshape(split_size, -1, image_features[0].shape[-1])
1246
+ frame_pos = (
1247
+ self.get_model()
1248
+ .get_frame_pos(
1249
+ selected_frame_indices_all[split_batch_idx]
1250
+ )
1251
+ .to(frame_feature.device)
1252
+ .to(frame_feature.dtype)
1253
+ )
1254
+ frame_feature += frame_pos
1255
+ split_image_features_unpadded.append(
1256
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1257
+ )
1258
+ else:
1259
+ split_image_features_unpadded.append(
1260
+ torch.cat(
1261
+ image_features_unpadded[
1262
+ start_idx : start_idx + split_size
1263
+ ],
1264
+ dim=0,
1265
+ )
1266
+ )
1267
+ else:
1268
+ if getattr(self.config, "frame_pos", False):
1269
+ frame_feature = image_features[
1270
+ start_idx : start_idx + split_size
1271
+ ].reshape(split_size, -1, image_features[0].shape[-1])
1272
+ frame_pos = (
1273
+ self.get_model()
1274
+ .get_frame_pos(selected_frame_indices_all[split_batch_idx])
1275
+ .to(frame_feature.device)
1276
+ .to(frame_feature.dtype)
1277
+ )
1278
+ frame_feature += frame_pos
1279
+ split_image_features.append(
1280
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1281
+ )
1282
+ else:
1283
+ split_image_features.append(
1284
+ image_features[start_idx : start_idx + split_size]
1285
+ )
1286
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1287
+ if getattr(self.config, "frame_pos", False):
1288
+ frame_feature = image_features_unpadded[
1289
+ start_idx : start_idx + split_size
1290
+ ]
1291
+ frame_pos = (
1292
+ self.get_model()
1293
+ .get_frame_pos(
1294
+ selected_frame_indices_all[split_batch_idx]
1295
+ )
1296
+ .to(frame_feature.device)
1297
+ .to(frame_feature.dtype)
1298
+ )
1299
+ frame_feature += frame_pos
1300
+ split_image_features_unpadded.append(
1301
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1302
+ )
1303
+ else:
1304
+ split_image_features_unpadded.append(
1305
+ image_features_unpadded[
1306
+ start_idx : start_idx + split_size
1307
+ ]
1308
+ )
1309
+ start_idx += split_size
1310
+ image_features = split_image_features
1311
+ frame_split_sizes = split_sizes
1312
+
1313
+ _labels = labels
1314
+ _position_ids = position_ids
1315
+ _attention_mask = attention_mask
1316
+ if attention_mask is None:
1317
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
1318
+ else:
1319
+ attention_mask = attention_mask.bool()
1320
+ if position_ids is None:
1321
+ position_ids = torch.arange(
1322
+ 0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
1323
+ )
1324
+ if labels is None:
1325
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
1326
+
1327
+ # remove the padding using attention_mask -- FIXME
1328
+ _input_ids = input_ids
1329
+
1330
+ attention_mask = attention_mask | (input_ids == IMAGE_TOKEN_INDEX)
1331
+
1332
+ input_ids = [
1333
+ cur_input_ids[cur_attention_mask]
1334
+ for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
1335
+ ]
1336
+ labels = [
1337
+ cur_labels[cur_attention_mask]
1338
+ for cur_labels, cur_attention_mask in zip(labels, attention_mask)
1339
+ ]
1340
+
1341
+ new_input_embeds = []
1342
+ new_labels = []
1343
+ image_token_indices_batch = []
1344
+ cur_image_idx = 0
1345
+ for batch_idx, cur_input_ids in enumerate(input_ids):
1346
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
1347
+ if num_images == 0:
1348
+ cur_image_features = image_features[cur_image_idx]
1349
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
1350
+ cur_input_embeds = torch.cat(
1351
+ [cur_input_embeds_1, cur_image_features[0:0]], dim=0
1352
+ )
1353
+ new_input_embeds.append(cur_input_embeds)
1354
+ new_labels.append(labels[batch_idx])
1355
+ cur_image_idx += 1
1356
+ continue
1357
+
1358
+ image_token_indices = (
1359
+ [-1]
1360
+ + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
1361
+ + [cur_input_ids.shape[0]]
1362
+ )
1363
+ image_token_indices_batch.append(
1364
+ torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()[0]
1365
+ )
1366
+ cur_input_ids_noim = []
1367
+ cur_labels = labels[batch_idx]
1368
+ cur_labels_noim = []
1369
+ for i in range(len(image_token_indices) - 1):
1370
+ cur_input_ids_noim.append(
1371
+ cur_input_ids[
1372
+ image_token_indices[i] + 1 : image_token_indices[i + 1]
1373
+ ]
1374
+ )
1375
+ cur_labels_noim.append(
1376
+ cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]
1377
+ )
1378
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
1379
+ cur_input_embeds = self.get_model().embed_tokens(
1380
+ torch.cat(cur_input_ids_noim)
1381
+ )
1382
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
1383
+ cur_new_input_embeds = []
1384
+ cur_new_labels = []
1385
+
1386
+ text_len = sum([x.shape[0] for x in cur_input_embeds_no_im])
1387
+ visual_len = len(image_features[cur_image_idx])
1388
+ max_visual_len = (
1389
+ self.get_model().config.tokenizer_model_max_length
1390
+ - getattr(self.get_model().config, "inference_max_length", 16)
1391
+ - text_len
1392
+ )
1393
+ mix_token = False
1394
+
1395
+ # ablation mix
1396
+ if (
1397
+ input_mix_res
1398
+ and (
1399
+ self.get_model().config.image_token_len
1400
+ > getattr(self.get_model().config, "lowres_token", 8) ** 2
1401
+ )
1402
+ and frame_split_sizes is not None
1403
+ and getattr(self.config, "highres", False)
1404
+ ):
1405
+ if max_visual_len > visual_len:
1406
+ visual_emb = image_features[cur_image_idx]
1407
+ text_emb = cur_input_embeds_no_im[-1]
1408
+ highres_num = math.floor(
1409
+ (max_visual_len - visual_len)
1410
+ / (
1411
+ split_image_features_unpadded[cur_image_idx].shape[0]
1412
+ // frame_split_sizes[cur_image_idx]
1413
+ - visual_emb.shape[0] // frame_split_sizes[cur_image_idx]
1414
+ )
1415
+ )
1416
+ if highres_num >= 1:
1417
+ mix_token = True
1418
+ sim = torch.matmul(visual_emb, text_emb.transpose(0, 1)).mean(
1419
+ dim=-1
1420
+ )
1421
+ sim_frame = sim.reshape(
1422
+ frame_split_sizes[cur_image_idx], -1
1423
+ ).mean(dim=-1)
1424
+ highres_num = min(highres_num, sim_frame.shape[0])
1425
+ top_values, top_indices = torch.topk(sim_frame, highres_num)
1426
+ if len(top_indices) > 0:
1427
+ sorted_indices = torch.sort(top_indices)[1]
1428
+ top_indices = top_indices[sorted_indices]
1429
+ visual_emb_frame = image_features[cur_image_idx].reshape(
1430
+ frame_split_sizes[cur_image_idx],
1431
+ -1,
1432
+ image_features[cur_image_idx].shape[-1],
1433
+ )
1434
+ visual_emb_frame_highres = split_image_features_unpadded[
1435
+ cur_image_idx
1436
+ ].reshape(
1437
+ frame_split_sizes[cur_image_idx],
1438
+ -1,
1439
+ split_image_features_unpadded[cur_image_idx].shape[-1],
1440
+ )
1441
+ current_point = 0
1442
+ mix_visual_emb_frame = []
1443
+ for frame_i in range(len(visual_emb_frame)):
1444
+ if current_point > len(top_indices) - 1:
1445
+ mix_visual_emb_frame.append(
1446
+ visual_emb_frame[frame_i]
1447
+ )
1448
+ continue
1449
+ if frame_i == top_indices[current_point]:
1450
+ mix_visual_emb_frame.append(
1451
+ visual_emb_frame_highres[frame_i]
1452
+ )
1453
+ current_point += 1
1454
+ else:
1455
+ mix_visual_emb_frame.append(
1456
+ visual_emb_frame[frame_i]
1457
+ )
1458
+ image_features[cur_image_idx] = torch.cat(
1459
+ mix_visual_emb_frame, dim=0
1460
+ )
1461
+ # ablation drop
1462
+
1463
+ if (
1464
+ max_visual_len < visual_len
1465
+ and frame_split_sizes is not None
1466
+ and not mix_token
1467
+ ):
1468
+ visual_emb_frame = image_features[cur_image_idx].reshape(
1469
+ frame_split_sizes[cur_image_idx],
1470
+ -1,
1471
+ image_features[cur_image_idx].shape[-1],
1472
+ )
1473
+
1474
+ sim = F.cosine_similarity(
1475
+ visual_emb_frame[:-1],
1476
+ visual_emb_frame[1:],
1477
+ dim=-1,
1478
+ )
1479
+
1480
+ new_visual_emb_frames = []
1481
+ for start_idx in range(0, len(visual_emb_frame), 8):
1482
+ end_idx = min(start_idx + 8, len(visual_emb_frame))
1483
+ chunk_feature = visual_emb_frame[start_idx:end_idx] # 8, HW, C
1484
+ if len(chunk_feature) == 1:
1485
+ new_visual_emb_frames.append(chunk_feature[0])
1486
+ continue
1487
+ sim = F.cosine_similarity(
1488
+ chunk_feature[0]
1489
+ .unsqueeze(0)
1490
+ .repeat_interleave(len(chunk_feature[1:]), dim=0),
1491
+ chunk_feature[1:],
1492
+ dim=-1,
1493
+ )
1494
+ new_visual_emb_frame = torch.cat(
1495
+ [
1496
+ chunk_feature[0],
1497
+ chunk_feature[1:].flatten(0, 1)[
1498
+ sim.flatten(0, 1)
1499
+ < getattr(
1500
+ self.get_model().config, "drop_threshold", 0.7
1501
+ )
1502
+ ],
1503
+ ],
1504
+ dim=0,
1505
+ )
1506
+ new_visual_emb_frames.append(new_visual_emb_frame)
1507
+
1508
+ reduced_visual_len = sum([x.shape[0] for x in new_visual_emb_frames])
1509
+
1510
+ if reduced_visual_len > max_visual_len:
1511
+ force_remove = math.ceil(
1512
+ (reduced_visual_len - max_visual_len)
1513
+ / len(new_visual_emb_frames)
1514
+ )
1515
+ for chunk_i in range(len(new_visual_emb_frames)):
1516
+ new_visual_emb_frames[chunk_i] = new_visual_emb_frames[chunk_i][
1517
+ :-force_remove
1518
+ ]
1519
+ new_visual_emb_frames = torch.cat(new_visual_emb_frames, dim=0)
1520
+ else:
1521
+ new_visual_emb_frames = torch.cat(new_visual_emb_frames, dim=0)
1522
+
1523
+ image_features[cur_image_idx] = new_visual_emb_frames[:max_visual_len]
1524
+
1525
+ for i in range(num_images + 1):
1526
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
1527
+ cur_new_labels.append(cur_labels_noim[i])
1528
+ if i < num_images:
1529
+ cur_image_features = image_features[cur_image_idx]
1530
+ cur_image_idx += 1
1531
+ cur_new_input_embeds.append(cur_image_features)
1532
+ cur_new_labels.append(
1533
+ torch.full(
1534
+ (cur_image_features.shape[0],),
1535
+ IGNORE_INDEX,
1536
+ device=cur_labels.device,
1537
+ dtype=cur_labels.dtype,
1538
+ )
1539
+ )
1540
+
1541
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
1542
+
1543
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
1544
+ cur_new_labels = torch.cat(cur_new_labels)
1545
+
1546
+ new_input_embeds.append(cur_new_input_embeds)
1547
+ new_labels.append(cur_new_labels)
1548
+
1549
+ # Truncate sequences to max length as image embeddings can make the sequence longer
1550
+ tokenizer_model_max_length = getattr(
1551
+ self.config, "tokenizer_model_max_length", None
1552
+ )
1553
+ if tokenizer_model_max_length is not None:
1554
+ new_input_embeds = [
1555
+ x[:tokenizer_model_max_length] for x in new_input_embeds
1556
+ ]
1557
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
1558
+
1559
+ # Combine them
1560
+ max_len = max(x.shape[0] for x in new_input_embeds)
1561
+ batch_size = len(new_input_embeds)
1562
+
1563
+ new_input_embeds_padded = []
1564
+ new_labels_padded = torch.full(
1565
+ (batch_size, max_len),
1566
+ IGNORE_INDEX,
1567
+ dtype=new_labels[0].dtype,
1568
+ device=new_labels[0].device,
1569
+ )
1570
+ attention_mask = torch.zeros(
1571
+ (batch_size, max_len),
1572
+ dtype=attention_mask.dtype,
1573
+ device=attention_mask.device,
1574
+ )
1575
+ position_ids = torch.zeros(
1576
+ (batch_size, max_len),
1577
+ dtype=position_ids.dtype,
1578
+ device=position_ids.device,
1579
+ )
1580
+
1581
+ for i, (cur_new_embed, cur_new_labels) in enumerate(
1582
+ zip(new_input_embeds, new_labels)
1583
+ ):
1584
+ cur_len = cur_new_embed.shape[0]
1585
+ if getattr(self.config, "tokenizer_padding_side", "right") == "left":
1586
+ new_input_embeds_padded.append(
1587
+ torch.cat(
1588
+ (
1589
+ torch.zeros(
1590
+ (max_len - cur_len, cur_new_embed.shape[1]),
1591
+ dtype=cur_new_embed.dtype,
1592
+ device=cur_new_embed.device,
1593
+ ),
1594
+ cur_new_embed,
1595
+ ),
1596
+ dim=0,
1597
+ )
1598
+ )
1599
+ if cur_len > 0:
1600
+ new_labels_padded[i, -cur_len:] = cur_new_labels
1601
+ attention_mask[i, -cur_len:] = True
1602
+ position_ids[i, -cur_len:] = torch.arange(
1603
+ 0,
1604
+ cur_len,
1605
+ dtype=position_ids.dtype,
1606
+ device=position_ids.device,
1607
+ )
1608
+ else:
1609
+ new_input_embeds_padded.append(
1610
+ torch.cat(
1611
+ (
1612
+ cur_new_embed,
1613
+ torch.zeros(
1614
+ (max_len - cur_len, cur_new_embed.shape[1]),
1615
+ dtype=cur_new_embed.dtype,
1616
+ device=cur_new_embed.device,
1617
+ ),
1618
+ ),
1619
+ dim=0,
1620
+ )
1621
+ )
1622
+ if cur_len > 0:
1623
+ new_labels_padded[i, :cur_len] = cur_new_labels
1624
+ attention_mask[i, :cur_len] = True
1625
+ position_ids[i, :cur_len] = torch.arange(
1626
+ 0,
1627
+ cur_len,
1628
+ dtype=position_ids.dtype,
1629
+ device=position_ids.device,
1630
+ )
1631
+
1632
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
1633
+
1634
+ if _labels is None:
1635
+ new_labels = None
1636
+ else:
1637
+ new_labels = new_labels_padded
1638
+
1639
+ if _attention_mask is None:
1640
+ attention_mask = None
1641
+ else:
1642
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
1643
+
1644
+ if _position_ids is None:
1645
+ position_ids = None
1646
+
1647
+ return (
1648
+ None,
1649
+ position_ids,
1650
+ attention_mask,
1651
+ past_key_values,
1652
+ new_input_embeds,
1653
+ new_labels,
1654
+ vision_tower_aux_feature_list_final,
1655
+ vision_tower_aux_attention_masks_list_final,
1656
+ final_size,
1657
+ global_context_feature_final,
1658
+ )
1659
+
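For readers skimming the diff, the core of `prepare_inputs_labels_for_multimodal` above is the splice at each `IMAGE_TOKEN_INDEX` placeholder: text tokens are embedded normally, image features are inserted in their place, and the spliced span is labelled `IGNORE_INDEX` so it never contributes to the loss. A minimal, self-contained sketch of that splice follows; the toy embedder, ids, and feature tensor are illustrative, not the repository API.

```python
# Minimal sketch of the splice performed above: input_ids are split at the
# image placeholder, image features are inserted between the text embeddings,
# and the inserted span is labelled IGNORE_INDEX so it is excluded from the loss.
import torch

IMAGE_TOKEN_INDEX, IGNORE_INDEX = -200, -100
embed = torch.nn.Embedding(1000, 32)                 # stand-in for embed_tokens

input_ids = torch.tensor([5, 6, IMAGE_TOKEN_INDEX, 7, 8])
labels = input_ids.clone()
image_features = torch.randn(144, 32)                # one frame worth of visual tokens

image_pos = (input_ids == IMAGE_TOKEN_INDEX).nonzero()[0, 0]
new_embeds = torch.cat([
    embed(input_ids[:image_pos]),                    # text before the image
    image_features,                                  # spliced visual tokens
    embed(input_ids[image_pos + 1:]),                # text after the image
])
new_labels = torch.cat([
    labels[:image_pos],
    torch.full((image_features.shape[0],), IGNORE_INDEX),
    labels[image_pos + 1:],
])
print(new_embeds.shape, new_labels.shape)            # (148, 32) and (148,)
```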
1660
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
1661
+ if model_args.mm_use_im_patch_token:
1662
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
1663
+ self.resize_token_embeddings(len(tokenizer))
1664
+
1665
+ if model_args.mm_use_im_start_end:
1666
+ num_new_tokens = tokenizer.add_tokens(
1667
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
1668
+ )
1669
+ self.resize_token_embeddings(len(tokenizer))
1670
+
1671
+ if num_new_tokens > 0:
1672
+ input_embeddings = self.get_input_embeddings().weight.data
1673
+ output_embeddings = self.get_output_embeddings().weight.data
1674
+
1675
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
1676
+ dim=0, keepdim=True
1677
+ )
1678
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
1679
+ dim=0, keepdim=True
1680
+ )
1681
+
1682
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
1683
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
1684
+
1685
+ if model_args.tune_mm_mlp_adapter:
1686
+ for p in self.get_input_embeddings().parameters():
1687
+ p.requires_grad = True
1688
+ for p in self.get_output_embeddings().parameters():
1689
+ p.requires_grad = False
1690
+
1691
+ if model_args.pretrain_mm_mlp_adapter:
1692
+ mm_projector_weights = torch.load(
1693
+ model_args.pretrain_mm_mlp_adapter, map_location="cpu"
1694
+ )
1695
+ embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
1696
+ assert num_new_tokens == 2
1697
+ if input_embeddings.shape == embed_tokens_weight.shape:
1698
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[
1699
+ -num_new_tokens:
1700
+ ]
1701
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
1702
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
1703
+ else:
1704
+ raise ValueError(
1705
+ f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}."
1706
+ )
1707
+ elif model_args.mm_use_im_patch_token:
1708
+ if model_args.tune_mm_mlp_adapter:
1709
+ for p in self.get_input_embeddings().parameters():
1710
+ p.requires_grad = False
1711
+ for p in self.get_output_embeddings().parameters():
1712
+ p.requires_grad = False
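The "ablation drop" branch earlier in this file is the spatiotemporal compression step: frames are handled in chunks of 8, each later frame's tokens are compared with the chunk's first frame via cosine similarity, and only tokens whose similarity falls below `drop_threshold` (0.7 in config.json below) are kept. Here is a minimal sketch of that idea on synthetic tensors; the helper name and toy sizes are assumptions for illustration only.

```python
# Minimal sketch of the similarity-based token dropping above: frames are
# processed in chunks of 8, each later frame is compared to the chunk's first
# frame, and only tokens whose cosine similarity falls below drop_threshold
# are kept. The function name and toy sizes are illustrative, not the repo API.
import torch
import torch.nn.functional as F

def drop_redundant_tokens(frames, drop_threshold=0.7, chunk=8):
    # frames: (num_frames, tokens_per_frame, hidden_dim)
    kept = []
    for start in range(0, len(frames), chunk):
        block = frames[start:start + chunk]
        if len(block) == 1:
            kept.append(block[0])
            continue
        # similarity of every later frame's tokens to the chunk's first frame
        sim = F.cosine_similarity(
            block[0].unsqueeze(0).repeat_interleave(len(block) - 1, dim=0),
            block[1:],
            dim=-1,
        )
        keep_mask = sim.flatten() < drop_threshold   # drop near-duplicate tokens
        kept.append(torch.cat([block[0], block[1:].flatten(0, 1)[keep_mask]], dim=0))
    return torch.cat(kept, dim=0)

base = torch.randn(1, 144, 3584)
tokens = base.repeat(16, 1, 1) + 0.01 * torch.randn(16, 144, 3584)  # near-duplicate frames
print(drop_redundant_tokens(tokens).shape)           # far fewer than 16 * 144 tokens
```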
config.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "_name_or_path": "jadechoghari/LongVU_Qwen2_7B",
3
+ "architectures": [
4
+ "CambrianQwenForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "modeling.CambrianConfig",
8
+ "AutoModel": "modeling.CambrianLlamaForCausalLM",
9
+ "AutoModelForCausalLM": "modeling.CambrianLlamaForCausalLM"
10
+ },
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 151643,
14
+ "connect_layer": 2,
15
+ "connector_depth": 3,
16
+ "connector_only": true,
17
+ "dino_threshold": 0.83,
18
+ "drop_threshold": 0.7,
19
+ "eos_token_id": 151645,
20
+ "frame_pos": false,
21
+ "freeze_mm_mlp_adapter": false,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 3584,
24
+ "highres": true,
25
+ "highres_connect": false,
26
+ "image_aspect_ratio": "pad",
27
+ "image_position": 91,
28
+ "image_token_len": 144,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 18944,
31
+ "is_st_sampler": false,
32
+ "lowres_token": 8,
33
+ "max_position_embeddings": 32768,
34
+ "max_window_layers": 28,
35
+ "mm_patch_merge_type": "flat",
36
+ "mm_projector_lr": null,
37
+ "mm_projector_type": "sva",
38
+ "mm_use_im_patch_token": false,
39
+ "mm_use_im_start_end": false,
40
+ "mm_vision_sampler_lr": null,
41
+ "mm_vision_select_feature": "patch",
42
+ "mm_vision_select_layer": -2,
43
+ "mm_vision_tower_aux_list": [
44
+ "siglip/CLIP-ViT-SO400M-14-384",
45
+ "facebook/dinov2-giant-res378"
46
+ ],
47
+ "mm_vision_tower_aux_token_len_list": [
48
+ 576,
49
+ 576
50
+ ],
51
+ "mm_vision_tower_lr": null,
52
+ "model_type": "cambrian_qwen",
53
+ "num_attention_heads": 28,
54
+ "num_hidden_layers": 28,
55
+ "num_key_value_heads": 4,
56
+ "num_of_vision_sampler_layers": 10,
57
+ "num_query_group": 1,
58
+ "pretraining_tp": 1,
59
+ "query_num_list": [
60
+ 144
61
+ ],
62
+ "rms_norm_eps": 1e-06,
63
+ "rope_scaling": null,
64
+ "rope_theta": 1000000.0,
65
+ "sliding_window": null,
66
+ "spmd_debug": null,
67
+ "spmd_fsdp_sharding": null,
68
+ "spmd_mesh": null,
69
+ "start_of_vision_sampler_layers": 0,
70
+ "stride_of_vision_sampler_layers": 3,
71
+ "tie_word_embeddings": false,
72
+ "tokenizer_model_max_length": 10000,
73
+ "tokenizer_padding_side": "right",
74
+ "torch_dtype": "float32",
75
+ "transformers_version": "4.44.2",
76
+ "tune_mm_mlp_adapter": false,
77
+ "unfreeze_mm_vision_tower": false,
78
+ "use_cache": false,
79
+ "use_mm_proj": true,
80
+ "use_pos_skipping": false,
81
+ "use_sliding_window": false,
82
+ "vision_hidden_size": 1024,
83
+ "vision_tower_aux_token_len_list": [
84
+ 576,
85
+ 576
86
+ ],
87
+ "vocab_size": 152064
88
+ }
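Because `auto_map` points the Auto classes at `modeling.py` in this repository, the checkpoint can be loaded through `transformers` with `trust_remote_code=True`. A minimal loading sketch follows; the repo id is taken from `_name_or_path` above and the half-precision choice is an assumption, not a requirement.

```python
# Minimal loading sketch: `auto_map` lets transformers import modeling.py,
# so the Auto classes work with trust_remote_code=True. The repo id comes
# from `_name_or_path` above; the fp16 choice is an assumption.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("jadechoghari/LongVU_Qwen2_7B", trust_remote_code=True)
print(config.model_type, config.image_token_len)     # cambrian_qwen 144

model = AutoModelForCausalLM.from_pretrained(
    "jadechoghari/LongVU_Qwen2_7B",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
```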
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e846f373072ab8e42ee7963e21514d543696ee2859c30570bb1b05a88d94f3ca
3
+ size 15343381968
modeling.py ADDED
@@ -0,0 +1,471 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM
24
+ from transformers.cache_utils import Cache, DynamicCache
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutputWithPast,
29
+ CausalLMOutputWithPast,
30
+ )
31
+ from transformers.utils import logging
32
+
33
+ from .cambrian_arch import CambrianMetaForCausalLM, CambrianMetaModel
34
+
35
+ IS_XLA_AVAILABLE = False
36
+
37
+ from transformers import Qwen2Config, Qwen2ForCausalLM, Qwen2Model
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ class CambrianConfig(Qwen2Config):
43
+ model_type = "cambrian_qwen"
44
+
45
+ debug = "debug"
46
+
47
+
48
+ class CambrianQwenModel(CambrianMetaModel, Qwen2Model):
49
+ config_class = CambrianConfig
50
+
51
+ def __init__(self, config: Qwen2Config):
52
+ super(CambrianQwenModel, self).__init__(config)
53
+
54
+ def forward(
55
+ self,
56
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
57
+ input_ids: torch.LongTensor = None,
58
+ attention_mask: Optional[torch.Tensor] = None,
59
+ position_ids: Optional[torch.LongTensor] = None,
60
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
61
+ inputs_embeds: Optional[torch.FloatTensor] = None,
62
+ use_cache: Optional[bool] = None,
63
+ output_attentions: Optional[bool] = None,
64
+ output_hidden_states: Optional[bool] = None,
65
+ return_dict: Optional[bool] = None,
66
+ cache_position: Optional[torch.LongTensor] = None,
67
+ vision_tower_aux_feature_list: Optional[List[torch.FloatTensor]] = None,
68
+ vision_tower_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
69
+ final_vision_feature_size: Optional[List[tuple]] = None,
70
+ global_context_feature: Optional[torch.Tensor] = None,
71
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
72
+ output_attentions = (
73
+ output_attentions
74
+ if output_attentions is not None
75
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `config`.
76
+ else self.config.output_attentions
77
+ )
78
+ output_hidden_states = (
79
+ output_hidden_states
80
+ if output_hidden_states is not None
81
+ else self.config.output_hidden_states
82
+ )
83
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
84
+
85
+ return_dict = (
86
+ return_dict if return_dict is not None else self.config.use_return_dict
87
+ )
88
+
89
+ if (input_ids is None) ^ (inputs_embeds is not None):
90
+ raise ValueError(
91
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
92
+ )
93
+
94
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `gradient_checkpointing`.
95
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `training`.
96
+ if self.gradient_checkpointing and self.training:
97
+ if use_cache:
98
+ logger.warning_once(
99
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
100
+ )
101
+ use_cache = False
102
+
103
+ use_legacy_cache = False
104
+ if use_cache and not isinstance(past_key_values, Cache):
105
+ use_legacy_cache = True
106
+ # pyre-fixme[6]: For 1st argument expected
107
+ # `Optional[Tuple[Tuple[FloatTensor]]]` but got
108
+ # `Optional[List[FloatTensor]]`.
109
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
110
+ logger.warning_once(
111
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
112
+ "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
113
+ )
114
+
115
+ if inputs_embeds is None:
116
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `embed_tokens`.
117
+ inputs_embeds = self.embed_tokens(input_ids)
118
+
119
+ if cache_position is None:
120
+ past_seen_tokens = (
121
+ # pyre-fixme[16]: Item `List` of `Union[List[torch._C.FloatTensor],
122
+ # DynamicCache]` has no attribute `get_seq_length`.
123
+ past_key_values.get_seq_length() if past_key_values is not None else 0
124
+ )
125
+ cache_position = torch.arange(
126
+ past_seen_tokens,
127
+ past_seen_tokens + inputs_embeds.shape[1],
128
+ device=inputs_embeds.device,
129
+ )
130
+ if position_ids is None:
131
+ position_ids = cache_position.unsqueeze(0)
132
+
133
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `_update_causal_mask`.
134
+ causal_mask = self._update_causal_mask(
135
+ attention_mask,
136
+ inputs_embeds,
137
+ cache_position,
138
+ past_key_values,
139
+ output_attentions,
140
+ )
141
+
142
+ hidden_states = inputs_embeds
143
+
144
+ # decoder layers
145
+ all_hidden_states = () if output_hidden_states else None
146
+ all_self_attns = () if output_attentions else None
147
+ next_decoder_cache = None
148
+
149
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `layers`.
150
+ for i, decoder_layer in enumerate(self.layers):
151
+ if output_hidden_states:
152
+ all_hidden_states += (hidden_states,)
153
+
154
+ if self.gradient_checkpointing and self.training:
155
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute
156
+ # `_gradient_checkpointing_func`.
157
+ layer_outputs = self._gradient_checkpointing_func(
158
+ decoder_layer.__call__,
159
+ hidden_states,
160
+ causal_mask,
161
+ position_ids,
162
+ past_key_values,
163
+ output_attentions,
164
+ use_cache,
165
+ cache_position,
166
+ )
167
+ else:
168
+ layer_outputs = decoder_layer(
169
+ hidden_states,
170
+ attention_mask=causal_mask,
171
+ position_ids=position_ids,
172
+ past_key_value=past_key_values,
173
+ output_attentions=output_attentions,
174
+ use_cache=use_cache,
175
+ cache_position=cache_position,
176
+ )
177
+
178
+ hidden_states = layer_outputs[0]
179
+
180
+ if use_cache:
181
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
182
+
183
+ if output_attentions:
184
+ all_self_attns += (layer_outputs[1],)
185
+
186
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `norm`.
187
+ hidden_states = self.norm(hidden_states)
188
+
189
+ # add hidden states from the last decoder layer
190
+ if output_hidden_states:
191
+ all_hidden_states += (hidden_states,)
192
+
193
+ next_cache = None
194
+ if use_cache:
195
+ next_cache = (
196
+ next_decoder_cache.to_legacy_cache()
197
+ if use_legacy_cache
198
+ else next_decoder_cache
199
+ )
200
+
201
+ if not return_dict:
202
+ return tuple(
203
+ v
204
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
205
+ if v is not None
206
+ )
207
+ return BaseModelOutputWithPast(
208
+ last_hidden_state=hidden_states,
209
+ past_key_values=next_cache,
210
+ hidden_states=all_hidden_states,
211
+ attentions=all_self_attns,
212
+ )
213
+
214
+
215
+ class CambrianQwenForCausalLM(Qwen2ForCausalLM, CambrianMetaForCausalLM):
216
+ config_class = CambrianConfig
217
+
218
+ def __init__(self, config):
219
+ # super(Qwen2ForCausalLM, self).__init__(config)
220
+ Qwen2ForCausalLM.__init__(self, config)
221
+ config.model_type = "cambrian_qwen"
222
+ config.rope_scaling = None
223
+
224
+ self.model = CambrianQwenModel(config)
225
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
226
+ # Initialize weights and apply final processing
227
+ self.post_init()
228
+
229
+ def get_model(self):
230
+ return self.model
231
+
232
+ def forward(
233
+ self,
234
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
235
+ input_ids: torch.LongTensor = None,
236
+ attention_mask: Optional[torch.Tensor] = None,
237
+ position_ids: Optional[torch.LongTensor] = None,
238
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
239
+ inputs_embeds: Optional[torch.FloatTensor] = None,
240
+ labels: Optional[torch.LongTensor] = None,
241
+ use_cache: Optional[bool] = None,
242
+ output_attentions: Optional[bool] = None,
243
+ output_hidden_states: Optional[bool] = None,
244
+ images: Optional[torch.FloatTensor] = None,
245
+ image_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
246
+ image_sizes: Optional[List[List[int]]] = None,
247
+ return_dict: Optional[bool] = None,
248
+ modalities: Optional[List[str]] = ["image"],
249
+ dpo_forward: Optional[bool] = False,
250
+ cache_position=None,
251
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
252
+
253
+ input_image_features = None
254
+ highres_image_features = None
255
+ frame_split_sizes = None
256
+
257
+ if inputs_embeds is None:
258
+ (
259
+ input_ids,
260
+ position_ids,
261
+ attention_mask,
262
+ past_key_values,
263
+ inputs_embeds,
264
+ labels,
265
+ vision_tower_aux_feature_list,
266
+ vision_tower_aux_attention_masks_list,
267
+ final_vision_feature_size,
268
+ global_context_feature,
269
+ ) = self.prepare_inputs_labels_for_multimodal(
270
+ input_ids,
271
+ position_ids,
272
+ attention_mask,
273
+ past_key_values,
274
+ labels,
275
+ images,
276
+ image_aux_attention_masks_list,
277
+ image_sizes,
278
+ )
279
+
280
+ if dpo_forward:
281
+ # pyre-fixme[29]: `CambrianQwenModel` is not a function.
282
+ outputs = self.model(
283
+ input_ids=input_ids,
284
+ attention_mask=attention_mask,
285
+ position_ids=position_ids,
286
+ past_key_values=past_key_values,
287
+ inputs_embeds=inputs_embeds,
288
+ use_cache=use_cache,
289
+ output_attentions=output_attentions,
290
+ output_hidden_states=output_hidden_states,
291
+ return_dict=return_dict,
292
+ )
293
+
294
+ hidden_states = outputs[0]
295
+ logits = self.lm_head(hidden_states)
296
+ return logits, labels
297
+
298
+ else:
299
+ if hasattr(self, "vision_tower_aux_feature_list"):
300
+ # pyre-fixme[29]: `CambrianQwenModel` is not a function.
301
+ outputs = self.model(
302
+ input_ids=input_ids,
303
+ attention_mask=attention_mask,
304
+ position_ids=position_ids,
305
+ past_key_values=past_key_values,
306
+ inputs_embeds=inputs_embeds,
307
+ use_cache=use_cache,
308
+ output_attentions=output_attentions,
309
+ output_hidden_states=output_hidden_states,
310
+ return_dict=return_dict,
311
+ vision_tower_aux_feature_list=(
312
+ # pyre-fixme[61]: `vision_tower_aux_feature_list` is
313
+ # undefined, or not always defined.
314
+ vision_tower_aux_feature_list
315
+ if inputs_embeds is None
316
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
317
+ # `vision_tower_aux_feature_list`.
318
+ else self.vision_tower_aux_feature_list
319
+ ),
320
+ vision_tower_aux_attention_masks_list=(
321
+ # pyre-fixme[61]: `vision_tower_aux_attention_masks_list` is
322
+ # undefined, or not always defined.
323
+ vision_tower_aux_attention_masks_list
324
+ if inputs_embeds is None
325
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
326
+ # `vision_tower_aux_attention_masks_list`.
327
+ else self.vision_tower_aux_attention_masks_list
328
+ ),
329
+ final_vision_feature_size=(
330
+ # pyre-fixme[61]: `final_vision_feature_size` is undefined,
331
+ # or not always defined.
332
+ final_vision_feature_size
333
+ if inputs_embeds is None
334
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
335
+ # `final_vision_feature_size`.
336
+ else self.final_vision_feature_size
337
+ ),
338
+ global_context_feature=(
339
+ # pyre-fixme[61]: `global_context_feature` is undefined, or
340
+ # not always defined.
341
+ global_context_feature
342
+ if inputs_embeds is None
343
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
344
+ # `global_context_feature`.
345
+ else self.global_context_feature
346
+ ),
347
+ )
348
+ else:
349
+ # pyre-fixme[29]: `CambrianQwenModel` is not a function.
350
+ outputs = self.model(
351
+ input_ids=input_ids,
352
+ attention_mask=attention_mask,
353
+ position_ids=position_ids,
354
+ past_key_values=past_key_values,
355
+ inputs_embeds=inputs_embeds,
356
+ use_cache=use_cache,
357
+ output_attentions=output_attentions,
358
+ output_hidden_states=output_hidden_states,
359
+ return_dict=return_dict,
360
+ # final_vision_feature_size=final_vision_feature_size,
361
+ )
362
+
363
+ hidden_states = outputs[0]
364
+ logits = self.lm_head(hidden_states)
365
+ logits = logits.float()
366
+
367
+ loss = None
368
+ if labels is not None:
369
+ # Shift so that tokens < n predict n
370
+ shift_logits = logits[..., :-1, :].contiguous()
371
+ shift_labels = labels[..., 1:].contiguous()
372
+ # Flatten the tokens
373
+ loss_fct = CrossEntropyLoss()
374
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute `config`.
375
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
376
+ shift_labels = shift_labels.view(-1)
377
+ # Enable model parallelism
378
+ shift_labels = shift_labels.to(shift_logits.device)
379
+ loss = loss_fct(shift_logits, shift_labels)
380
+
381
+ if not return_dict:
382
+ output = (logits,) + outputs[1:]
383
+ return (loss,) + output if loss is not None else output
384
+
385
+ return CausalLMOutputWithPast(
386
+ loss=loss,
387
+ logits=logits,
388
+ past_key_values=outputs.past_key_values,
389
+ hidden_states=outputs.hidden_states,
390
+ attentions=outputs.attentions,
391
+ )
392
+
393
+ @torch.no_grad()
394
+ def generate(
395
+ self,
396
+ inputs: Optional[torch.Tensor] = None,
397
+ images: Optional[torch.Tensor] = None,
398
+ image_sizes: Optional[torch.Tensor] = None,
399
+ **kwargs,
400
+ ) -> Union[GenerateOutput, torch.LongTensor]:
401
+ position_ids = kwargs.pop("position_ids", None)
402
+ attention_mask = kwargs.pop("attention_mask", None)
403
+ if "inputs_embeds" in kwargs:
404
+ raise NotImplementedError("`inputs_embeds` is not supported")
405
+
406
+ if images is not None:
407
+ (
408
+ inputs,
409
+ position_ids,
410
+ attention_mask,
411
+ _,
412
+ inputs_embeds,
413
+ _,
414
+ vision_tower_aux_feature_list,
415
+ vision_tower_aux_attention_masks_list,
416
+ final_vision_feature_size,
417
+ global_context_feature,
418
+ ) = self.prepare_inputs_labels_for_multimodal(
419
+ inputs,
420
+ position_ids,
421
+ attention_mask,
422
+ None,
423
+ None,
424
+ images,
425
+ image_sizes=image_sizes,
426
+ )
427
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
428
+ # `vision_tower_aux_feature_list`.
429
+ self.vision_tower_aux_feature_list = vision_tower_aux_feature_list
430
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
431
+ # `vision_tower_aux_attention_masks_list`.
432
+ self.vision_tower_aux_attention_masks_list = (
433
+ vision_tower_aux_attention_masks_list
434
+ )
435
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
436
+ # `final_vision_feature_size`.
437
+ self.final_vision_feature_size = final_vision_feature_size
438
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
439
+ # `global_context_feature`.
440
+ self.global_context_feature = global_context_feature
441
+ else:
442
+ inputs_embeds = self.get_model().embed_tokens(inputs)
443
+
444
+ # pyre-fixme[16]: `Qwen2ForCausalLM` has no attribute `generate`.
445
+ return super().generate(
446
+ position_ids=position_ids,
447
+ attention_mask=attention_mask,
448
+ inputs_embeds=inputs_embeds,
449
+ **kwargs,
450
+ )
451
+
452
+ def prepare_inputs_for_generation(
453
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
454
+ ):
455
+ images = kwargs.pop("images", None)
456
+ image_sizes = kwargs.pop("image_sizes", None)
457
+ inputs = super().prepare_inputs_for_generation(
458
+ input_ids,
459
+ past_key_values=past_key_values,
460
+ inputs_embeds=inputs_embeds,
461
+ **kwargs,
462
+ )
463
+ if images is not None:
464
+ inputs["images"] = images
465
+ if image_sizes is not None:
466
+ inputs["image_sizes"] = image_sizes
467
+ return inputs
468
+
469
+
470
+ AutoConfig.register("cambrian_qwen", CambrianConfig)
471
+ AutoModelForCausalLM.register(CambrianConfig, CambrianQwenForCausalLM)
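The two `register` calls above are what let `AutoConfig`/`AutoModelForCausalLM` resolve the `cambrian_qwen` model type once this module has been imported. A self-contained sketch of the same registration pattern; the `TinyDemo*` names and the small config values are invented so the example runs quickly, while the real module registers `CambrianConfig` and `CambrianQwenForCausalLM` the same way.

```python
# Self-contained illustration of the Auto-class registration pattern used
# above; TinyDemo* and the tiny config values are made up for the example.
from transformers import AutoConfig, AutoModelForCausalLM, Qwen2Config, Qwen2ForCausalLM

class TinyDemoConfig(Qwen2Config):
    model_type = "tiny_demo"

class TinyDemoForCausalLM(Qwen2ForCausalLM):
    config_class = TinyDemoConfig

AutoConfig.register("tiny_demo", TinyDemoConfig)
AutoModelForCausalLM.register(TinyDemoConfig, TinyDemoForCausalLM)

cfg = TinyDemoConfig(
    hidden_size=64, intermediate_size=128, num_hidden_layers=2,
    num_attention_heads=4, num_key_value_heads=2, vocab_size=256,
)
model = AutoModelForCausalLM.from_config(cfg)
print(type(model).__name__)  # TinyDemoForCausalLM
```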
multimodal_encoder_builder.py ADDED
@@ -0,0 +1,368 @@
1
+ # pyre-unsafe
2
+ import copy
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from transformers import AutoImageProcessor, Dinov2Config, Dinov2Model, SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
6
+ from abc import ABC, abstractmethod
7
+ import torch.nn as nn
8
+
9
+
10
+ class ProcessorWrapper:
11
+ def __init__(
12
+ self,
13
+ transform,
14
+ height=378,
15
+ width=378,
16
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
17
+ ):
18
+ self._crop_size = {
19
+ "height": height,
20
+ "width": width,
21
+ }
22
+ self._transforms = transform
23
+ # print(transform)
24
+ self.image_mean = image_mean
25
+
26
+ @property
27
+ def crop_size(self):
28
+ return self._crop_size
29
+
30
+ def preprocess(self, image, return_tensors="pt"):
31
+ # Ensure image is a PIL Image
32
+ output = {}
33
+ output["pixel_values"] = [self._transforms(image)]
34
+ return output
35
+
36
+
37
+ class BaseVisionTower(nn.Module):
38
+ def __init__(self, vision_tower_name, args, delay_load=False):
39
+ super().__init__()
40
+
41
+ self.is_loaded = False
42
+ self.args = args
43
+
44
+ self.vision_tower_name = vision_tower_name
45
+ self.select_layer = args.mm_vision_select_layer
46
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
47
+ self.unfreeze_mm_vision_tower = getattr(args, "unfreeze_mm_vision_tower", False)
48
+ self.delay_load = delay_load
49
+
50
+ @abstractmethod
51
+ def load_model(self, device_map=None):
52
+ raise NotImplementedError("Subclasses must implement load_model")
53
+
54
+ @abstractmethod
55
+ def _forward(self, images):
56
+ raise NotImplementedError("Subclasses must implement forward")
57
+
58
+ def forward(self, images):
59
+ if type(images) is list:
60
+ image_features = [self._forward(image.unsqueeze(0)) for image in images]
61
+ else:
62
+ image_features = self._forward(images)
63
+
64
+ return image_features
65
+
66
+ @property
67
+ def dummy_feature(self):
68
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
69
+
70
+ @property
71
+ def dtype(self):
72
+ # Dynamically infer the dtype from the first parameter, if not explicitly specified
73
+ if hasattr(self.vision_tower, "dtype"):
74
+ return self.vision_tower.dtype
75
+ else:
76
+ params = list(self.vision_tower.parameters())
77
+ return (
78
+ params[0].dtype if len(params) > 0 else torch.float32
79
+ ) # Default to torch.float32 if no parameters
80
+
81
+ @property
82
+ def device(self):
83
+ # Dynamically infer the device from the first parameter, if not explicitly specified
84
+ if hasattr(self.vision_tower, "device"):
85
+ return self.vision_tower.device
86
+ else:
87
+ params = list(self.vision_tower.parameters())
88
+ return (
89
+ params[0].device if len(params) > 0 else torch.device("cpu")
90
+ ) # Default to CPU if no parameters
91
+
92
+ @property
93
+ def config(self):
94
+ if self.is_loaded:
95
+ return self.vision_tower.config
96
+ else:
97
+ return self.cfg_only
98
+
99
+ @property
100
+ def hidden_size(self):
101
+ try:
102
+ return self.config.hidden_size
103
+ except:
104
+ return self._hidden_size
105
+
106
+ @property
107
+ def image_size(self): # resolution
108
+ # return self.config.image_size
109
+ try:
110
+ return self.config.image_size
111
+ except:
112
+ return self._image_size
113
+
114
+ @property
115
+ def patch_size(self):
116
+ # return self.config.patch_size
117
+ try:
118
+ return self.config.patch_size
119
+ except:
120
+ return self._patch_size
121
+
122
+ @property
123
+ def num_patches_per_side(self):
124
+ if self._interp_size is not None:
125
+ return int(self._interp_size**0.5)
126
+ try:
127
+ return self.image_size // self.patch_size
128
+ except:
129
+ return self._num_patches_per_side
130
+
131
+ @property
132
+ def num_patches(self):
133
+ if self._interp_size is not None:
134
+ return self._interp_size
135
+ try:
136
+ return self.num_patches_per_side**2
137
+ except:
138
+ return self._num_patches
139
+
140
+
141
+ class DinoVisionTower(BaseVisionTower):
142
+ def __init__(self, vision_tower, args, delay_load=False):
143
+ super(DinoVisionTower, self).__init__(vision_tower, args, delay_load)
144
+
145
+ model_path = "facebook/dinov2-giant"
146
+ base_model_name, res, interp = model_path, 378, 576
147
+ self._vision_tower_name = vision_tower
148
+ self.vision_tower_name = base_model_name
149
+ self._image_size = res
150
+ self._interp_size = interp
151
+ self._patch_size = 14 # default patch size
152
+
153
+ if not self.delay_load:
154
+ self.load_model()
155
+ else:
156
+ self.cfg_only = Dinov2Config.from_pretrained(self.vision_tower_name)
157
+
158
+ def load_model(self, device_map=None):
159
+
160
+ self.vision_tower = Dinov2Model.from_pretrained(self.vision_tower_name)
161
+ """ValueError: Dinov2Model does not support `device_map='auto'`. To implement support, the model class needs to implement the `_no_split_modules` attribute."""
162
+ self.vision_tower._no_split_modules = ["Dinov2SwiGLUFFN"]
163
+
164
+ _image_size = self.vision_tower.config.image_size
165
+ if self._image_size is None:
166
+ self._image_size = _image_size
167
+
168
+ # increase shortest edge to prevent edge case crops
169
+ default_shortest_ratio = 8 / 7 # 256/224
170
+ # shortest_edge = int(default_shortest_ratio * self._image_size)
171
+ shortest_edge = self._image_size
172
+
173
+ processor = AutoImageProcessor.from_pretrained(
174
+ self.vision_tower_name,
175
+ crop_size=dict(height=self._image_size, width=self._image_size),
176
+ size=dict(shortest_edge=shortest_edge),
177
+ )
178
+ self.image_processor = processor
179
+
180
+ # Assign the output channels of the projection convolution as the hidden size
181
+ self._hidden_size = (
182
+ self.vision_tower.embeddings.patch_embeddings.projection.out_channels
183
+ )
184
+ # Assign the first value of the stride of the projection convolution as the patch size
185
+ self._patch_size = (
186
+ self.vision_tower.embeddings.patch_embeddings.projection.stride[0]
187
+ )
188
+
189
+ # print(self._hidden_size, self._patch_size)
190
+
191
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
192
+ self.is_loaded = True
193
+
194
+ @property
195
+ def image_size(self):
196
+ return self._image_size
197
+
198
+ def feature_select(self, outputs):
199
+ sequence_output = outputs[
200
+ "last_hidden_state"
201
+ ] # batch_size, sequence_length, hidden_size
202
+
203
+ if self.select_feature == "cls_patch":
204
+ image_features = sequence_output
205
+ elif self.select_feature == "patch":
206
+ image_features = sequence_output[:, 1:]
207
+ elif self.select_feature == "cls":
208
+ image_features = sequence_output[:, 0]
209
+ else:
210
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
211
+ return image_features
212
+
213
+ def interpolate(self, image_features):
214
+ if self._interp_size is None:
215
+ return image_features
216
+
217
+ b, num_tokens, dim = image_features.shape
218
+
219
+ if num_tokens != self.num_patches:
220
+ target_h = target_w = int(self._interp_size**0.5)
221
+ h = w = int(num_tokens**0.5)
222
+
223
+ image_features = image_features.view(b, h, w, dim)
224
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
225
+
226
+ image_features = F.interpolate(
227
+ image_features.to(torch.float32),
228
+ size=(target_h, target_w),
229
+ mode="bilinear",
230
+ align_corners=False,
231
+ ).to(image_features.dtype)
232
+
233
+ # Permute the dimensions back to (b, target_h, target_w, dim)
234
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
235
+
236
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
237
+ image_features = image_features.flatten(1, 2)
238
+
239
+ return image_features
240
+
241
+ def _forward(self, images):
242
+ # logger.warning(f"images shape: {images.shape}")
243
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
244
+ image_forward_outs = self.vision_tower.forward(
245
+ images.to(device=self.device, dtype=self.dtype)
246
+ )
247
+ # logger.warning(f"image_forward_outs shape: {image_forward_outs['last_hidden_state'].shape}")
248
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
249
+ # logger.warning(f"image_features shape: {image_features.shape}")
250
+ interp_features = self.interpolate(image_features)
251
+ # logger.warning(f"interp_features shape: {interp_features.shape}")
252
+ return interp_features
253
+
254
+ @property
255
+ def num_patches_per_side(self):
256
+ return int(self.num_patches**0.5)
257
+
258
+ @property
259
+ def num_patches(self):
260
+ if self._interp_size is None:
261
+ return (self._image_size // self._patch_size) ** 2
262
+ else:
263
+ return self._interp_size
264
+
265
+
266
+ # from .siglip_encoder import SiglipVisionTower
267
+ class SiglipVisionTower(BaseVisionTower):
268
+ def __init__(self, vision_tower_name, args, delay_load=False):
269
+ super(SiglipVisionTower, self).__init__(vision_tower_name, args, delay_load)
270
+
271
+ model_path = "google/siglip-so400m-patch14-384"
272
+ base_model_name, res, interp = model_path, 384, 576
273
+ self.vision_tower_name = base_model_name
274
+ self._image_size = res if res is not None else 512
275
+ self._interp_size = interp
276
+ if not self.delay_load:
277
+ self.load_model()
278
+ elif self.unfreeze_mm_vision_tower:
279
+ self.load_model()
280
+ else:
281
+ self._hidden_size = 1152
282
+
283
+ def load_model(self, device_map=None):
284
+ self.vision_model = "siglip"
285
+ # clip_model, processor = create_model_from_pretrained(self.vision_tower_name)
286
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
287
+
288
+ # self.vision_tower = clip_model.visual.trunk
289
+ self.vision_tower.output_tokens = True
290
+
291
+ self._hidden_size = self.vision_tower.config.hidden_size
292
+ self._image_size = self.vision_tower.config.image_size
293
+ self._patch_size = self.vision_tower.config.patch_size
294
+ self.image_processor = SiglipImageProcessor.from_pretrained(
295
+ self.vision_tower_name
296
+ )
297
+
298
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
299
+ self.is_loaded = True
300
+
301
+ def interpolate(self, image_features):
302
+ if self._interp_size is None:
303
+ return image_features
304
+
305
+ b, num_tokens, dim = image_features.shape
306
+
307
+ if num_tokens != self.num_patches:
308
+ target_h = target_w = int(self._interp_size**0.5)
309
+ h = w = int(num_tokens**0.5)
310
+
311
+ image_features = image_features.view(b, h, w, dim)
312
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
313
+
314
+ image_features = F.interpolate(
315
+ image_features.to(torch.float32),
316
+ size=(target_h, target_w),
317
+ mode="bilinear",
318
+ align_corners=False,
319
+ ).to(image_features.dtype)
320
+
321
+ # Permute the dimensions back to (b, target_h, target_w, dim)
322
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
323
+
324
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
325
+ image_features = image_features.flatten(1, 2)
326
+
327
+ return image_features
328
+
329
+ def _forward(self, images, interpolate_token=576):
330
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
331
+ image_features = self.vision_tower.forward(
332
+ images.to(device=self.device, dtype=self.dtype),
333
+ output_hidden_states=True,
334
+ ).hidden_states[-1]
335
+ interp_features = self.interpolate(image_features)
336
+ return interp_features
337
+
338
+
339
+ def build_vision_tower_aux_list(vision_tower_cfg, **kwargs):
340
+ vision_tower_aux_name_list = getattr(
341
+ vision_tower_cfg,
342
+ "mm_vision_tower_aux_list",
343
+ getattr(vision_tower_cfg, "vision_tower_aux_list", None),
344
+ )
345
+ vision_tower_aux_token_len_list = getattr(
346
+ vision_tower_cfg,
347
+ "mm_vision_tower_aux_token_len_list",
348
+ getattr(vision_tower_cfg, "vision_tower_aux_token_len_list", None),
349
+ )
350
+ vision_tower_aux_list = []
351
+ for vision_tower_aux_name, vision_tower_aux_token_len in zip(
352
+ vision_tower_aux_name_list, vision_tower_aux_token_len_list
353
+ ):
354
+ config = copy.deepcopy(vision_tower_cfg)
355
+ vision_tower_aux_name += "-interp{}".format(vision_tower_aux_token_len)
356
+ if "siglip" in vision_tower_aux_name.lower():
357
+ vision_tower_aux_list.append(
358
+ SiglipVisionTower(vision_tower_aux_name, args=config, **kwargs)
359
+ )
360
+
361
+ # SSL-based Vision Towers
362
+ elif "dinov2" in vision_tower_aux_name.lower():
363
+ vision_tower_aux_list.append(
364
+ DinoVisionTower(vision_tower_aux_name, args=config, **kwargs)
365
+ )
366
+ else:
367
+ raise ValueError(f"Unknown vision tower: {vision_tower_aux_name}")
368
+ return vision_tower_aux_list
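Both towers funnel their outputs through the same `interpolate` step so that SigLIP and DINOv2 each contribute exactly 576 tokens (see `mm_vision_tower_aux_token_len_list` in config.json). A minimal sketch of that token-grid resampling; the batch size and channel count below are illustrative.

```python
# Minimal sketch of the token-grid resampling both towers perform in
# `interpolate`: a (B, N, C) token sequence is viewed as a square grid,
# bilinearly resized, and flattened back so every tower emits the same
# number of tokens.
import torch
import torch.nn.functional as F

def resample_tokens(x, target_tokens=576):
    b, n, c = x.shape
    side, target_side = int(n ** 0.5), int(target_tokens ** 0.5)
    grid = x.view(b, side, side, c).permute(0, 3, 1, 2)              # (B, C, H, W)
    grid = F.interpolate(grid.float(), size=(target_side, target_side),
                         mode="bilinear", align_corners=False).to(x.dtype)
    return grid.permute(0, 2, 3, 1).flatten(1, 2)                    # (B, target, C)

# e.g. DINOv2-giant at 378 px with 14 px patches gives 27 * 27 = 729 tokens
print(resample_tokens(torch.randn(2, 729, 1536)).shape)              # (2, 576, 1536)
```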
multimodal_projector_builder.py ADDED
@@ -0,0 +1,52 @@
1
+ # pyre-unsafe
2
+ import re
3
+
4
+ import torch.nn as nn
5
+
6
+
7
+ class IdentityMap(nn.Module):
8
+ def __init__(self):
9
+ super().__init__()
10
+
11
+ def forward(self, x, *args, **kwargs):
12
+ return x
13
+
14
+ @property
15
+ def config(self):
16
+ return {"mm_projector_type": "identity"}
17
+
18
+
19
+ class SimpleResBlock(nn.Module):
20
+ def __init__(self, channels):
21
+ super().__init__()
22
+ self.pre_norm = nn.LayerNorm(channels)
23
+
24
+ self.proj = nn.Sequential(
25
+ nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
26
+ )
27
+
28
+ def forward(self, x):
29
+ x = self.pre_norm(x)
30
+ return x + self.proj(x)
31
+
32
+
33
+ def build_vision_projector(config, delay_load=False, **kwargs):
34
+ projector_type = getattr(config, "mm_projector_type", "linear")
35
+ config.mm_hidden_size = 256
36
+
37
+ if projector_type == "linear":
38
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
39
+
40
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
41
+ if mlp_gelu_match:
42
+ mlp_depth = int(mlp_gelu_match.group(1))
43
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
44
+ for _ in range(1, mlp_depth):
45
+ modules.append(nn.GELU())
46
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
47
+ return nn.Sequential(*modules)
48
+
49
+ if projector_type == "identity":
50
+ return IdentityMap()
51
+
52
+ raise ValueError(f"Unknown projector type: {projector_type}")
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<image>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ }
36
+ },
37
+ "additional_special_tokens": [
38
+ "<|im_start|>",
39
+ "<|im_end|>"
40
+ ],
41
+ "bos_token": null,
42
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
43
+ "clean_up_tokenization_spaces": false,
44
+ "eos_token": "<|im_end|>",
45
+ "errors": "replace",
46
+ "model_max_length": 32768,
47
+ "pad_token": "<|endoftext|>",
48
+ "padding_side": "right",
49
+ "processor_class": "LlavaProcessor",
50
+ "split_special_tokens": false,
51
+ "tokenizer_class": "Qwen2Tokenizer",
52
+ "unk_token": null
53
+ }
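The `chat_template` above is a standard ChatML-style Qwen2 template. A minimal sketch of applying it follows; the repo id is an assumption, and `<image>` (id 151646) is the placeholder the model code later replaces with visual tokens.

```python
# Sketch of applying the chat template above; the repo id is an assumption.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("jadechoghari/LongVU_Qwen2_7B")
messages = [{"role": "user", "content": "<image>\nDescribe this video in detail"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>
# Describe this video in detail<|im_end|>
# <|im_start|>assistant
```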
vision_sampler.py ADDED
@@ -0,0 +1,566 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+
8
+
9
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
10
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
11
+ """
12
+ grid_size: int of the grid height and width
13
+ return:
14
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
15
+ """
16
+ grid_h = np.arange(grid_size, dtype=np.float32)
17
+ grid_w = np.arange(grid_size, dtype=np.float32)
18
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
19
+ grid = np.stack(grid, axis=0)
20
+
21
+ grid = grid.reshape([2, 1, grid_size, grid_size])
22
+
23
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
24
+ if cls_token:
25
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
26
+ return pos_embed
27
+
28
+
29
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
30
+ assert embed_dim % 2 == 0
31
+
32
+ # use half of dimensions to encode grid_h
33
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
34
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
35
+
36
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
37
+ return emb
38
+
39
+
40
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
41
+ """
42
+ embed_dim: output dimension for each position
43
+ pos: a list of positions to be encoded: size (M,)
44
+ out: (M, D)
45
+ """
46
+ assert embed_dim % 2 == 0
47
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
48
+ omega /= embed_dim / 2.0
49
+ omega = 1.0 / 10000**omega # (D/2,)
50
+
51
+ pos = pos.reshape(-1) # (M,)
52
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
53
+
54
+ emb_sin = np.sin(out) # (M, D/2)
55
+ emb_cos = np.cos(out) # (M, D/2)
56
+
57
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
58
+ return emb
59
+
60
+
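A short usage sketch for the two sin/cos helpers above (it assumes `get_2d_sincos_pos_embed` is in scope from this file); the 12x12 grid happens to match the 144-token frames used elsewhere in this repo, while the hidden dimension is illustrative.

```python
# Usage sketch for the helpers above: build a fixed sin/cos table for a
# 12x12 token grid and add it to a matching set of frame tokens.
import torch

embed_dim, grid_size = 3584, 12                      # 12 * 12 = 144 tokens per frame
pos = get_2d_sincos_pos_embed(embed_dim, grid_size)  # (144, 3584) numpy array
pos = torch.from_numpy(pos).float()

frame_tokens = torch.randn(144, embed_dim)
frame_tokens = frame_tokens + pos                    # position-aware tokens
print(pos.shape, frame_tokens.shape)
```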
61
+ class CrossAttention(nn.Module):
62
+
63
+ def __init__(self, q_dim, kv_dim, hidden_dim, num_heads, attention_bias=False):
64
+ super().__init__()
65
+ self.hidden_dim = hidden_dim
66
+ self.num_heads = num_heads
67
+ self.head_dim = self.hidden_dim // self.num_heads
68
+
69
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
70
+ raise ValueError(
71
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
72
+ f" and `num_heads`: {self.num_heads})."
73
+ )
74
+
75
+ self.q_proj = nn.Sequential(
76
+ nn.LayerNorm(q_dim),
77
+ nn.Linear(q_dim, self.num_heads * self.head_dim, bias=attention_bias),
78
+ )
79
+ self.k_proj = nn.Sequential(
80
+ nn.LayerNorm(kv_dim),
81
+ nn.Linear(kv_dim, self.num_heads * self.head_dim, bias=attention_bias),
82
+ )
83
+ self.v_proj = nn.Sequential(
84
+ nn.LayerNorm(kv_dim),
85
+ nn.Linear(kv_dim, self.num_heads * self.head_dim, bias=attention_bias),
86
+ )
87
+ self.o_proj = nn.Linear(
88
+ self.num_heads * self.head_dim, q_dim, bias=attention_bias
89
+ )
90
+
91
+ def forward(self, vision_latents, queries, attention_mask):
92
+
93
+ bsz, q_len, _ = queries.size()
94
+ bsz, v_len, _ = vision_latents.size()
95
+
96
+ query_states = self.q_proj(queries)
97
+ key_states = self.k_proj(vision_latents)
98
+ value_states = self.v_proj(vision_latents)
99
+
100
+ query_states = query_states.view(
101
+ bsz, q_len, self.num_heads, self.head_dim
102
+ ).transpose(1, 2)
103
+ key_states = key_states.view(
104
+ bsz, v_len, self.num_heads, self.head_dim
105
+ ).transpose(1, 2)
106
+ value_states = value_states.view(
107
+ bsz, v_len, self.num_heads, self.head_dim
108
+ ).transpose(1, 2)
109
+
110
+ if attention_mask is not None:
111
+ if attention_mask.size() != (bsz, 1, q_len, v_len):
112
+ raise ValueError(
113
+ f"Attention mask should be of size {(bsz, 1, q_len, v_len)}, but is {attention_mask.size()}"
114
+ )
115
+
116
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
117
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
118
+ if query_states.device.type == "cuda" and attention_mask is not None:
119
+ query_states = query_states.contiguous()
120
+ key_states = key_states.contiguous()
121
+ value_states = value_states.contiguous()
122
+
123
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
124
+ query_states,
125
+ key_states,
126
+ value_states,
127
+ attn_mask=attention_mask,
128
+ )
129
+
130
+ attn_output = attn_output.transpose(1, 2).contiguous()
131
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_dim)
132
+
133
+ attn_output = self.o_proj(attn_output)
134
+
135
+ return attn_output
136
+
137
+
138
+ class AggregationBlock(nn.Module):
139
+ def __init__(
140
+ self, attention, q_dim, kv_dim, hidden_dim, num_heads, attention_bias=False
141
+ ):
142
+ super().__init__()
143
+ self.hidden_dim = hidden_dim
144
+ self.num_heads = num_heads
145
+ self.head_dim = self.hidden_dim // self.num_heads
146
+
147
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
148
+ raise ValueError(
149
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
150
+ f" and `num_heads`: {self.num_heads})."
151
+ )
152
+
153
+ self.attention = attention
154
+ if attention:
155
+ self.attention_layer = CrossAttention(
156
+ q_dim, kv_dim, hidden_dim, num_heads, attention_bias
157
+ )
158
+ else:
159
+ self.attention_layer = MLP(kv_dim, q_dim, q_dim)
160
+
161
+ def forward(self, vision_latents, queries, attention_mask):
162
+ if self.attention:
163
+ queries = self.attention_layer(vision_latents, queries, attention_mask)
164
+ else:
165
+ queries = self.attention_layer(vision_latents)
166
+
167
+ return queries
168
+
169
+
170
+class MultiKVCrossAttention(nn.Module):
+
+    def __init__(self, q_dim, kv_dim_list, hidden_dim, num_heads, attention_bias=False):
+        super().__init__()
+
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.head_dim = self.hidden_dim // self.num_heads
+
+        if (self.head_dim * self.num_heads) != self.hidden_dim:
+            raise ValueError(
+                f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Sequential(
+            nn.LayerNorm(q_dim),
+            nn.Linear(q_dim, self.num_heads * self.head_dim, bias=attention_bias),
+        )
+        self.num_of_kvs = len(kv_dim_list)
+        for i, kv_dim in enumerate(kv_dim_list):
+            setattr(
+                self,
+                "k_proj_{}".format(i),
+                nn.Sequential(
+                    nn.LayerNorm(kv_dim),
+                    nn.Linear(
+                        kv_dim, self.num_heads * self.head_dim, bias=attention_bias
+                    ),
+                ),
+            )
+            setattr(
+                self,
+                "v_proj_{}".format(i),
+                nn.Sequential(
+                    nn.LayerNorm(kv_dim),
+                    nn.Linear(
+                        kv_dim, self.num_heads * self.head_dim, bias=attention_bias
+                    ),
+                ),
+            )
+        self.o_proj = nn.Linear(
+            self.num_heads * self.head_dim, q_dim, bias=attention_bias
+        )
+
+    def forward(
+        self,
+        queries,
+        *vision_latents_attention_mask_list,
+    ):
+
+        vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+        attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+        bsz, q_len, _ = queries.size()
+
+        query_states = self.q_proj(queries)
+        key_states = torch.cat(
+            [
+                getattr(self, "k_proj_{}".format(i))(vision_latents_list[i])
+                for i in range(self.num_of_kvs)
+            ],
+            dim=1,
+        )
+        value_states = torch.cat(
+            [
+                getattr(self, "v_proj_{}".format(i))(vision_latents_list[i])
+                for i in range(self.num_of_kvs)
+            ],
+            dim=1,
+        )
+
+        v_len = key_states.shape[1]
+
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, v_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, v_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+
+        # if kv_weight is not None:
+        #     kv_weight = kv_weight.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
+
+        attention_mask = torch.cat(attention_mask_list, dim=-1)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, v_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, v_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+        )
+        # attn_output = spda(
+        #     query_states,
+        #     key_states,
+        #     value_states,
+        #     attn_mask=attention_mask,
+        #     additional_score=kv_weight
+        # )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_dim)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output
+
+
+class MLP(nn.Module):
+    def __init__(self, d_in, d_hidden, d_out):
+        super().__init__()
+        self.linear_1 = nn.Linear(d_in, d_hidden, bias=False)
+        self.act = nn.GELU()
+        self.linear_2 = nn.Linear(d_hidden, d_out, bias=False)
+
+    def forward(self, x):
+        return self.linear_2(self.act(self.linear_1(x)))
+
+
+class VisionCrossAttentionLayer(nn.Module):
+    def __init__(
+        self,
+        q_dim,
+        context_dim,
+        kv_dim_list,
+        kv_size_list,
+        hidden_dim=1024,
+        layer_idx=0,
+    ):
+        super().__init__()
+        num_heads = 16
+        self.num_of_kvs = len(kv_dim_list)
+
+        self.proj_context = nn.Linear(context_dim, hidden_dim, bias=False)
+        self.proj_in = nn.Linear(q_dim + hidden_dim, hidden_dim, bias=False)
+        # if self.num_of_kvs > 1:
+        #     self.weight_mlp = MLP(q_dim+hidden_dim, hidden_dim, self.num_of_kvs)
+        #     self.tower_weight = nn.Parameter(torch.zeros((self.num_of_kvs)))
+        self.proj_out = MLP(hidden_dim, hidden_dim, q_dim)
+
+        self.norm = nn.LayerNorm(hidden_dim)
+
+        self.cross_attn = MultiKVCrossAttention(
+            hidden_dim, kv_dim_list, hidden_dim, num_heads
+        )
+        self.kv_size_list = kv_size_list
+        for i, kv_size in enumerate(kv_size_list):
+            if kv_size > 1:
+                setattr(
+                    self,
+                    "pos_embed_{}".format(i),
+                    nn.Parameter(torch.randn(kv_size**2, hidden_dim)),
+                )
+                # self.register_buffer("pos_embed_{}".format(i), torch.from_numpy(get_2d_sincos_pos_embed(hidden_dim, kv_size)).float(), persistent=False)
+
+    def forward(
+        self,
+        queries,
+        context_feature,
+        *vision_latents_attention_mask_list,
+    ) -> torch.FloatTensor:
+
+        residual = queries
+        # queries = self.proj_in(queries)
+        context_feature = self.proj_context(context_feature)
+        # queries = queries + context_feature
+        queries = torch.cat([queries, context_feature], -1)
+
+        # if self.num_of_kvs > 1:
+        #     kv_weight = self.weight_mlp(queries)  # B * 1 * num_tower
+        #     kv_weight = kv_weight + self.tower_weight.view(1, 1, -1)
+        #     kv_weight = kv_weight.softmax(-1)
+        #     kv_number_list = [size**2 for size in self.kv_size_list]
+        #     kv_weight = torch.repeat_interleave(kv_weight, torch.tensor(kv_number_list).to(kv_weight.device), dim=-1)
+        # else:
+        #     kv_weight = None
+
+        queries = self.proj_in(queries)
+
+        vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+        attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+        attention_mask_list_reshaped = []
+        if attention_mask_list is not None:
+            for attention_mask in attention_mask_list:
+                attention_mask = attention_mask.view(attention_mask.shape[0], 1, 1, -1)
+                attention_mask = attention_mask.expand(-1, -1, queries.shape[1], -1)
+                attention_mask_list_reshaped.append(attention_mask)
+
+        vision_latents_pos_list = []
+        for i, vision_latents in enumerate(vision_latents_list):
+            if vision_latents.shape[1] > 1:
+                vision_latents_pos_list.append(
+                    vision_latents
+                    + getattr(self, "pos_embed_{}".format(i))[None, :, :].to(
+                        vision_latents.dtype
+                    )
+                )
+            else:
+                vision_latents_pos_list.append(vision_latents)
+
+        # Cross Attention
+        attention_output = self.cross_attn(
+            queries, *vision_latents_pos_list, *attention_mask_list_reshaped
+        )
+
+        # attention_output = (attention_output * combination_weight).sum(2)
+        queries = queries + attention_output
+
+        queries = self.norm(queries)
+
+        queries = self.proj_out(queries)
+
+        queries = queries + residual
+
+        return queries
+
+
+class VisionAggregationLayer(nn.Module):
+    def __init__(
+        self,
+        q_dim,
+        context_dim,
+        kv_dim_list,
+        kv_size_list,
+        hidden_dim=1024,
+        layer_idx=0,
+    ):
+        super().__init__()
+        num_heads = 16
+        self.num_of_kvs = len(kv_dim_list)
+
+        self.proj_context = nn.Linear(context_dim, hidden_dim, bias=False)
+        self.proj_in = nn.Linear(q_dim + hidden_dim, hidden_dim, bias=False)
+
+        self.proj_out = MLP(hidden_dim, hidden_dim, q_dim)
+
+        self.norm = nn.LayerNorm(hidden_dim)
+
+        if self.num_of_kvs > 1:
+            self.weight_mlp = MLP(q_dim + hidden_dim, hidden_dim, self.num_of_kvs)
+
+        for i, kv_size in enumerate(kv_size_list):
+            if kv_size > 1:
+                setattr(
+                    self,
+                    "pos_embed_{}".format(i),
+                    nn.Parameter(torch.randn(kv_size**2, hidden_dim)),
+                )
+                setattr(
+                    self,
+                    "aggregate_{}".format(i),
+                    AggregationBlock(
+                        True, hidden_dim, kv_dim_list[i], hidden_dim, num_heads
+                    ),
+                )
+            else:
+                setattr(
+                    self,
+                    "aggregate_{}".format(i),
+                    AggregationBlock(
+                        False, hidden_dim, kv_dim_list[i], hidden_dim, num_heads
+                    ),
+                )
+
+    def forward(
+        self,
+        queries,
+        context_feature,
+        *vision_latents_attention_mask_list,
+    ) -> torch.FloatTensor:
+
+        residual = queries
+        # queries = self.proj_in(queries)
+        context_feature = self.proj_context(context_feature)
+        # queries = queries + context_feature
+        queries = torch.cat([queries, context_feature], -1)
+
+        if self.num_of_kvs > 1:
+            combination_weight = self.weight_mlp(queries).softmax(
+                -1
+            )  # B * 1 * num_tower
+            combination_weight = combination_weight.unsqueeze(-1)
+        else:
+            combination_weight = 1
+
+        queries = self.proj_in(queries)
+
+        vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+        attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+        attention_mask_list_reshaped = []
+        if attention_mask_list is not None:
+            for attention_mask in attention_mask_list:
+                attention_mask = attention_mask.view(attention_mask.shape[0], 1, 1, -1)
+                attention_mask = attention_mask.expand(-1, -1, queries.shape[1], -1)
+                attention_mask_list_reshaped.append(attention_mask)
+
+        vision_latents_pos_list = []
+        for i, vision_latents in enumerate(vision_latents_list):
+            if vision_latents.shape[1] > 1:
+                vision_latents_pos_list.append(
+                    vision_latents
+                    + getattr(self, "pos_embed_{}".format(i))[None, :, :].to(
+                        vision_latents.dtype
+                    )
+                )
+            else:
+                vision_latents_pos_list.append(vision_latents)
+
+        aggregated_vision_latents_list = []
+        for i, (vision_latents, attention_mask) in enumerate(
+            zip(vision_latents_pos_list, attention_mask_list_reshaped)
+        ):
+            aggregated_vision_latents_list.append(
+                getattr(self, "aggregate_{}".format(i))(
+                    vision_latents, queries, attention_mask
+                )
+            )
+
+        aggregated_vision_latents = torch.stack(aggregated_vision_latents_list, 2)
+
+        queries = queries + (aggregated_vision_latents * combination_weight).sum(2)
+
+        queries = self.norm(queries)
+
+        queries = self.proj_out(queries)
+
+        queries = queries + residual
+
+        return queries
+
+
+class VisionTokenSampler(nn.Module):
+    def __init__(
+        self,
+        q_dim,
+        context_dim,
+        kv_dim_list,
+        kv_size_list,
+        vision_hidden_size,
+        num_of_layers=1,
+        layer_type="joint",
+    ):
+        super().__init__()
+        assert layer_type in ["joint", "sep"]
+        if layer_type == "joint":
+            self.layers = nn.ModuleList(
+                [
+                    VisionCrossAttentionLayer(
+                        q_dim,
+                        context_dim,
+                        kv_dim_list,
+                        kv_size_list,
+                        vision_hidden_size,
+                        idx,
+                    )
+                    for idx in range(num_of_layers)
+                ]
+            )
+        else:
+            self.layers = nn.ModuleList(
+                [
+                    VisionAggregationLayer(
+                        q_dim,
+                        context_dim,
+                        kv_dim_list,
+                        kv_size_list,
+                        vision_hidden_size,
+                        idx,
+                    )
+                    for idx in range(num_of_layers)
+                ]
+            )
+
+    def forward(self, queries, context_feature, *vision_latents_attention_mask_list):
+        for layer in self.layers:
+            queries = layer(
+                queries, context_feature, *vision_latents_attention_mask_list
+            )
+        return queries
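
Below is a minimal usage sketch of the `VisionTokenSampler` added in this file. It is illustrative only: the import path (`longvu.vision_sampler`), the dimensions, and the tensor shapes are assumptions, chosen so that each tower's key/value width matches `vision_hidden_size` (required for the learned positional embeddings above to broadcast), not values taken from the commit.

```python
# Hypothetical usage sketch for the VisionTokenSampler defined in this file.
# The import path, dimensions, and shapes below are illustrative assumptions.
import torch

from longvu.vision_sampler import VisionTokenSampler  # assumed module location

hidden = 1024                       # assumed vision hidden size; kv dims must match it
q_dim, context_dim = 1152, 1024     # assumed query / context feature widths
kv_size_list = [24, 24]             # assumed spatial grid side length per vision tower
kv_dim_list = [hidden, hidden]      # one feature width per tower

sampler = VisionTokenSampler(
    q_dim, context_dim, kv_dim_list, kv_size_list,
    vision_hidden_size=hidden, num_of_layers=1, layer_type="joint",
)

bsz, q_len = 1, 144
queries = torch.randn(bsz, q_len, q_dim)
context = torch.randn(bsz, q_len, context_dim)
# One latent tensor and one boolean mask (True = attend) per tower,
# passed positionally after the query and context features.
latents = [torch.randn(bsz, s**2, d) for s, d in zip(kv_size_list, kv_dim_list)]
masks = [torch.ones(bsz, s**2, dtype=torch.bool) for s in kv_size_list]

out = sampler(queries, context, *latents, *masks)
print(out.shape)  # torch.Size([1, 144, 1152])
```

With `layer_type="joint"`, every layer cross-attends to the concatenated tokens of all towers at once; with `"sep"`, each tower is aggregated by its own block and the results are mixed by learned combination weights.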
vocab.json ADDED
The diff for this file is too large to render. See raw diff