Spaces:

thongnguyen5999
/

mama

Runtime error

App Files Community

Thong Nguyen committed on Oct 7

Commit

c494e29

•

1 Parent(s): 0b91904

fix llava

Browse files

Files changed (19) hide show

app.py +10 -12
flagged/log.csv +2 -0
flagged/video_file/tmpiq5zmz4o.mp4 +0 -0
llava/__pycache__/__init__.cpython-39.pyc +0 -0
llava/__pycache__/conversation.cpython-39.pyc +0 -0
llava/__pycache__/utils.cpython-39.pyc +0 -0
llava/model/__pycache__/builder.cpython-39.pyc +0 -0
llava/model/__pycache__/llava_arch.cpython-39.pyc +0 -0
llava/model/builder.py +1 -2
llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc +0 -0
llava/model/language_model/llava_llama.py +359 -2
llava/model/llava_arch.py +1 -1
llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc +0 -0
llava/model/multimodal_encoder/clip_encoder.py +129 -2
requirements.txt +3 -2
video_keyframe_detector/KeyFrameDetector/__pycache__/__init__.cpython-39.pyc +0 -0
video_keyframe_detector/KeyFrameDetector/__pycache__/key_frame_detector.cpython-39.pyc +0 -0
video_keyframe_detector/KeyFrameDetector/__pycache__/utils.cpython-39.pyc +0 -0
video_keyframe_detector/__pycache__/cli.cpython-39.pyc +0 -0

app.py CHANGED Viewed

@@ -23,7 +23,9 @@ from llava.mm_utils import (
     KeywordsStoppingCriteria,
 )
 import torch
 def extract_keyframes(video_path, num_keyframes=12):
     video_id = video_path.split('/')[-1].strip().split('.')[0]
@@ -126,7 +128,7 @@ def eval_model(args, model_name, tokenizer, model, image_processor, context_len)
         images,
         image_processor,
         model.config
-    ).to(model.device, dtype=torch.float32)
     input_ids = (
         tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
@@ -136,14 +138,14 @@ def eval_model(args, model_name, tokenizer, model, image_processor, context_len)
     stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids,
             images=images_tensor,
-            do_sample=True,
             temperature=0.2,
-            max_new_tokens=1024,
             use_cache=True,
             stopping_criteria=[stopping_criteria],
         )
@@ -165,9 +167,6 @@ def eval_model(args, model_name, tokenizer, model, image_processor, context_len)
 def generate_video_caption(video_path):
-    model_path = "liuhaotian/llava-v1.5-7b"
-    model_name = get_model_name_from_path(model_path)
-    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, device_map="auto", offload_folder="offload_folder")
     video_id = video_path.split('/')[-1].strip().split('.')[0]
     image_file = os.path.join("concatenated_frames", f"{video_id}.jpg")
@@ -181,7 +180,7 @@ def generate_video_caption(video_path):
         "conv_mode": None,
         "image_file": image_file,
         "sep": ",",
-        "max_new_tokens": 1024,
         "temperature": 0.2
     })()
@@ -204,13 +203,12 @@ def video_to_text(video_file):
     return video_caption
 iface = gr.Interface(
     fn=video_to_text,
     inputs=gr.File(file_types=["video"]),
     outputs="text",
-    title="Video to Text Transcription",
     description="Upload a video and get the transcribed text"
 )
-iface.launch()

     KeywordsStoppingCriteria,
 )
 import torch
+model_path = "liuhaotian/llava-v1.5-7b"
+model_name = get_model_name_from_path(model_path)
+tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, device_map='cpu', offload_folder='offload_folder')
 def extract_keyframes(video_path, num_keyframes=12):
     video_id = video_path.split('/')[-1].strip().split('.')[0]
         images,
         image_processor,
         model.config
+    ).to(model.device, dtype=torch.float16)
     input_ids = (
         tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
     stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids,
             images=images_tensor,
+            do_sample=False,
             temperature=0.2,
+            max_new_tokens=64,
             use_cache=True,
             stopping_criteria=[stopping_criteria],
         )
 def generate_video_caption(video_path):
     video_id = video_path.split('/')[-1].strip().split('.')[0]
     image_file = os.path.join("concatenated_frames", f"{video_id}.jpg")
         "conv_mode": None,
         "image_file": image_file,
         "sep": ",",
+        "max_new_tokens": 64,
         "temperature": 0.2
     })()
     return video_caption
 iface = gr.Interface(
     fn=video_to_text,
     inputs=gr.File(file_types=["video"]),
     outputs="text",
+    title="MAMA Video-Text Generation Pipeline",
     description="Upload a video and get the transcribed text"
 )
+iface.launch(share=True)

flagged/log.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ video_file,output,flag,username,timestamp
2	+ /mnt/data/nguyenpk/mama/flagged/video_file/tmpiq5zmz4o.mp4,,,,2024-10-07 09:21:18.784629

flagged/video_file/tmpiq5zmz4o.mp4 ADDED Viewed

Binary file (351 kB). View file

llava/__pycache__/__init__.cpython-39.pyc CHANGED Viewed

Binary files a/llava/__pycache__/__init__.cpython-39.pyc and b/llava/__pycache__/__init__.cpython-39.pyc differ

llava/__pycache__/conversation.cpython-39.pyc CHANGED Viewed

Binary files a/llava/__pycache__/conversation.cpython-39.pyc and b/llava/__pycache__/conversation.cpython-39.pyc differ

llava/__pycache__/utils.cpython-39.pyc CHANGED Viewed

Binary files a/llava/__pycache__/utils.cpython-39.pyc and b/llava/__pycache__/utils.cpython-39.pyc differ

llava/model/__pycache__/builder.cpython-39.pyc CHANGED Viewed

Binary files a/llava/model/__pycache__/builder.cpython-39.pyc and b/llava/model/__pycache__/builder.cpython-39.pyc differ

llava/model/__pycache__/llava_arch.cpython-39.pyc CHANGED Viewed

Binary files a/llava/model/__pycache__/llava_arch.cpython-39.pyc and b/llava/model/__pycache__/llava_arch.cpython-39.pyc differ

llava/model/builder.py CHANGED Viewed

@@ -40,8 +40,7 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
             bnb_4bit_quant_type='nf4'
         )
     else:
-        kwargs['torch_dtype'] = torch.float32
-        # kwargs['torch_dtype'] = torch.float16
     if use_flash_attn:
         kwargs['attn_implementation'] = 'flash_attention_2'

             bnb_4bit_quant_type='nf4'
         )
     else:
+        kwargs['torch_dtype'] = torch.float16
     if use_flash_attn:
         kwargs['attn_implementation'] = 'flash_attention_2'

llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc CHANGED Viewed

Binary files a/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc and b/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc differ

llava/model/language_model/llava_llama.py CHANGED Viewed

@@ -17,14 +17,370 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 from transformers import AutoConfig, AutoModelForCausalLM, \
-                         LlamaConfig, LlamaModel, LlamaForCausalLM
-from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.generation.utils import GenerateOutput
 from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
 class LlavaConfig(LlamaConfig):
@@ -68,6 +424,7 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
         images: Optional[torch.FloatTensor] = None,
         image_sizes: Optional[List[List[int]]] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         if inputs_embeds is None:

 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from transformers import AutoConfig, AutoModelForCausalLM, \
+                         LlamaConfig, LlamaForCausalLM, LlamaPreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
 from transformers.generation.utils import GenerateOutput
 from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask
+from transformers.cache_utils import Cache, DynamicCache
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+    Args:
+        config: LlamaConfig
+    """
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._use_sdpa = config._attn_implementation == "sdpa"
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+        # embed positions
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer.float()(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class LlamaForCausalLM(LlamaPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.pretraining_tp > 1:
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head.float()(hidden_states)
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
 class LlavaConfig(LlamaConfig):
         images: Optional[torch.FloatTensor] = None,
         image_sizes: Optional[List[List[int]]] = None,
         return_dict: Optional[bool] = None,
+        cache_position = None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         if inputs_embeds is None:

llava/model/llava_arch.py CHANGED Viewed

@@ -139,7 +139,7 @@ class LlavaMetaForCausalLM(ABC):
     def encode_images(self, images):
         image_features = self.get_model().get_vision_tower()(images)
-        image_features = self.get_model().mm_projector(image_features)
         return image_features
     def prepare_inputs_labels_for_multimodal(

     def encode_images(self, images):
         image_features = self.get_model().get_vision_tower()(images)
+        image_features = self.get_model().mm_projector.float()(image_features.float())
         return image_features
     def prepare_inputs_labels_for_multimodal(

llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc CHANGED Viewed

Binary files a/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc and b/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc differ

llava/model/multimodal_encoder/clip_encoder.py CHANGED Viewed

@@ -1,7 +1,134 @@
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
 class CLIPVisionTower(nn.Module):
@@ -51,7 +178,7 @@ class CLIPVisionTower(nn.Module):
                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
                 image_features.append(image_feature)
         else:
-            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
             image_features = self.feature_select(image_forward_outs).to(images.dtype)
         return image_features

 import torch
 import torch.nn as nn
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig, CLIPPreTrainedModel
+from transformers.models.clip.modeling_clip import CLIPEncoder
+from typing import Any, Optional, Tuple, Union
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+class CLIPVisionEmbeddings(nn.Module):
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding.float()(pixel_values)  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.to(target_dtype)
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+class CLIPVisionTransformer(nn.Module):
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = CLIPVisionEmbeddings(config)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = CLIPEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.pre_layrnorm.float()(hidden_states.float())
+        encoder_outputs = self.encoder.float()(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm.float()(pooled_output)
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+class CLIPVisionModel(CLIPPreTrainedModel):
+    config_class = CLIPVisionConfig
+    main_input_name = "pixel_values"
+    _no_split_modules = ["CLIPEncoderLayer"]
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__(config)
+        self.vision_model = CLIPVisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
 class CLIPVisionTower(nn.Module):
                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
                 image_features.append(image_feature)
         else:
+            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype).float(), output_hidden_states=True)
             image_features = self.feature_select(image_forward_outs).to(images.dtype)
         return image_features

requirements.txt CHANGED Viewed

@@ -6,6 +6,7 @@ torchvision==0.16.2
 peakutils
 matplotlib
 protobuf
-transformers
 sentencepiece
-accelerate>=0.26.0

 peakutils
 matplotlib
 protobuf
 sentencepiece
+accelerate>=0.26.0
+bitsandbytes
+transformers==4.37.2

video_keyframe_detector/KeyFrameDetector/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (250 Bytes). View file

video_keyframe_detector/KeyFrameDetector/__pycache__/key_frame_detector.cpython-39.pyc ADDED Viewed

Binary file (1.94 kB). View file

video_keyframe_detector/KeyFrameDetector/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (2.07 kB). View file

video_keyframe_detector/__pycache__/cli.cpython-39.pyc ADDED Viewed

Binary file (744 Bytes). View file