Thong Nguyen committed
Commit c494e29
1 Parent(s): 0b91904
app.py CHANGED
@@ -23,7 +23,9 @@ from llava.mm_utils import (
     KeywordsStoppingCriteria,
 )
 import torch
-
+model_path = "liuhaotian/llava-v1.5-7b"
+model_name = get_model_name_from_path(model_path)
+tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, device_map='cpu', offload_folder='offload_folder')
 
 def extract_keyframes(video_path, num_keyframes=12):
     video_id = video_path.split('/')[-1].strip().split('.')[0]
@@ -126,7 +128,7 @@ def eval_model(args, model_name, tokenizer, model, image_processor, context_len)
         images,
         image_processor,
         model.config
-    ).to(model.device, dtype=torch.float32)
+    ).to(model.device, dtype=torch.float16)
 
     input_ids = (
         tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
@@ -136,14 +138,14 @@ def eval_model(args, model_name, tokenizer, model, image_processor, context_len)
     stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
-
+
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids,
             images=images_tensor,
-            do_sample=True,
+            do_sample=False,
             temperature=0.2,
-            max_new_tokens=1024,
+            max_new_tokens=64,
             use_cache=True,
             stopping_criteria=[stopping_criteria],
         )
@@ -165,9 +167,6 @@ def eval_model(args, model_name, tokenizer, model, image_processor, context_len)
 
 
 def generate_video_caption(video_path):
-    model_path = "liuhaotian/llava-v1.5-7b"
-    model_name = get_model_name_from_path(model_path)
-    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, device_map="auto", offload_folder="offload_folder")
     video_id = video_path.split('/')[-1].strip().split('.')[0]
 
     image_file = os.path.join("concatenated_frames", f"{video_id}.jpg")
@@ -181,7 +180,7 @@ def generate_video_caption(video_path):
         "conv_mode": None,
         "image_file": image_file,
         "sep": ",",
-        "max_new_tokens": 1024,
+        "max_new_tokens": 64,
         "temperature": 0.2
     })()
 
@@ -204,13 +203,12 @@ def video_to_text(video_file):
 
     return video_caption
 
-
 iface = gr.Interface(
     fn=video_to_text,
     inputs=gr.File(file_types=["video"]),
     outputs="text",
-    title="Video to Text Transcription",
+    title="MAMA Video-Text Generation Pipeline",
     description="Upload a video and get the transcribed text"
 )
 
-iface.launch()
+iface.launch(share=True)
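Note on the decoding change above: with `do_sample=False`, generation is greedy, so the retained `temperature=0.2` no longer has any effect (recent `transformers` releases warn when sampling parameters are combined with greedy decoding). A minimal sketch of the equivalent configuration, assuming only the stock `transformers` API; the commented call mirrors the diff, and `images=` is LLaVA's extension of `generate`:

```python
from transformers import GenerationConfig

# Greedy decoding with a short budget, as in the new code; temperature is omitted
# because it is ignored when do_sample=False.
greedy_cfg = GenerationConfig(do_sample=False, max_new_tokens=64, use_cache=True)

# Equivalent call shape (illustrative; `images` is the LLaVA-specific keyword):
# output_ids = model.generate(
#     input_ids,
#     images=images_tensor,
#     generation_config=greedy_cfg,
#     stopping_criteria=[stopping_criteria],
# )
```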
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+video_file,output,flag,username,timestamp
+/mnt/data/nguyenpk/mama/flagged/video_file/tmpiq5zmz4o.mp4,,,,2024-10-07 09:21:18.784629
flagged/video_file/tmpiq5zmz4o.mp4 ADDED
Binary file (351 kB)
 
llava/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/llava/__pycache__/__init__.cpython-39.pyc and b/llava/__pycache__/__init__.cpython-39.pyc differ
 
llava/__pycache__/conversation.cpython-39.pyc CHANGED
Binary files a/llava/__pycache__/conversation.cpython-39.pyc and b/llava/__pycache__/conversation.cpython-39.pyc differ
 
llava/__pycache__/utils.cpython-39.pyc CHANGED
Binary files a/llava/__pycache__/utils.cpython-39.pyc and b/llava/__pycache__/utils.cpython-39.pyc differ
 
llava/model/__pycache__/builder.cpython-39.pyc CHANGED
Binary files a/llava/model/__pycache__/builder.cpython-39.pyc and b/llava/model/__pycache__/builder.cpython-39.pyc differ
 
llava/model/__pycache__/llava_arch.cpython-39.pyc CHANGED
Binary files a/llava/model/__pycache__/llava_arch.cpython-39.pyc and b/llava/model/__pycache__/llava_arch.cpython-39.pyc differ
 
llava/model/builder.py CHANGED
@@ -40,8 +40,7 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
             bnb_4bit_quant_type='nf4'
         )
     else:
-        kwargs['torch_dtype'] = torch.float32
-        # kwargs['torch_dtype'] = torch.float16
+        kwargs['torch_dtype'] = torch.float16
 
     if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'
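The builder now requests half-precision weights instead of full precision. For context, a minimal sketch of how a `torch_dtype` entry in the assembled kwargs is consumed by `from_pretrained` (standard Hugging Face API; the checkpoint id below is illustrative, not the app's model path):

```python
import torch
from transformers import AutoModelForCausalLM

kwargs = {"device_map": "cpu", "torch_dtype": torch.float16}

# float16 weights take roughly half the memory of float32, at the cost of running
# half-precision ops on whatever device the model lands on.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # illustrative checkpoint id only
    low_cpu_mem_usage=True,
    **kwargs,
)
```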
llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc and b/llava/model/language_model/__pycache__/llava_llama.cpython-39.pyc differ
 
llava/model/language_model/llava_llama.py CHANGED
@@ -17,14 +17,370 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from transformers import AutoConfig, AutoModelForCausalLM, \
-    LlamaConfig, LlamaModel, LlamaForCausalLM
+    LlamaConfig, LlamaForCausalLM, LlamaPreTrainedModel
 
-from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
 from transformers.generation.utils import GenerateOutput
 
 from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask
+from transformers.cache_utils import Cache, DynamicCache
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+
+
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
+    Args:
+        config: LlamaConfig
+    """
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._use_sdpa = config._attn_implementation == "sdpa"
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer.float()(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class LlamaForCausalLM(LlamaPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
+        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.pretraining_tp > 1:
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head.float()(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
 
 
 class LlavaConfig(LlamaConfig):
@@ -68,6 +424,7 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
         images: Optional[torch.FloatTensor] = None,
         image_sizes: Optional[List[List[int]]] = None,
         return_dict: Optional[bool] = None,
+        cache_position = None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
 
         if inputs_embeds is None:
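One easy-to-miss piece of the second hunk is the extra `cache_position=None` parameter on `LlavaLlamaForCausalLM.forward`. Newer `transformers` releases forward a `cache_position` keyword during generation, and an override without that parameter would fail on the unexpected keyword; accepting and ignoring it keeps the signature compatible. A tiny, self-contained illustration of the failure mode (hypothetical function names, not the repository's code):

```python
def forward_without(input_ids=None, return_dict=None):
    return input_ids

def forward_with(input_ids=None, return_dict=None, cache_position=None):
    return input_ids  # cache_position is accepted but unused

kwargs = {"input_ids": [1, 2, 3], "cache_position": None}
forward_with(**kwargs)       # fine
# forward_without(**kwargs)  # TypeError: unexpected keyword argument 'cache_position'
```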
llava/model/llava_arch.py CHANGED
@@ -139,7 +139,7 @@ class LlavaMetaForCausalLM(ABC):
 
     def encode_images(self, images):
         image_features = self.get_model().get_vision_tower()(images)
-        image_features = self.get_model().mm_projector(image_features)
+        image_features = self.get_model().mm_projector.float()(image_features.float())
         return image_features
 
     def prepare_inputs_labels_for_multimodal(
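The `encode_images` change runs the multimodal projector in float32 even though the checkpoint is now loaded in float16 (see the `builder.py` hunk above), which sidesteps half-precision matmuls on CPU. The trade-off is that the projected features come back as float32 unless they are cast back. A minimal sketch of this hand-off, with illustrative shapes and layer rather than the repository's code:

```python
import torch
import torch.nn as nn

proj = nn.Linear(1024, 4096).half()                     # projector weights stored in fp16
feats = torch.randn(1, 576, 1024, dtype=torch.float16)  # illustrative image features

# .float() converts the module's parameters in place (and returns the module),
# so the matmul below runs entirely in float32.
out = proj.float()(feats.float())
out = out.to(torch.float16)                             # optional: restore the model dtype
```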
llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc and b/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc differ
 
llava/model/multimodal_encoder/clip_encoder.py CHANGED
@@ -1,7 +1,134 @@
 import torch
 import torch.nn as nn
 
-from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig, CLIPPreTrainedModel
+from transformers.models.clip.modeling_clip import CLIPEncoder
+from typing import Any, Optional, Tuple, Union
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+
+
+class CLIPVisionEmbeddings(nn.Module):
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding.float()(pixel_values)  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.to(target_dtype)
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+
+
+class CLIPVisionTransformer(nn.Module):
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = CLIPVisionEmbeddings(config)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = CLIPEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.pre_layrnorm.float()(hidden_states.float())
+
+        encoder_outputs = self.encoder.float()(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm.float()(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class CLIPVisionModel(CLIPPreTrainedModel):
+    config_class = CLIPVisionConfig
+    main_input_name = "pixel_values"
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__(config)
+        self.vision_model = CLIPVisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
 
 
 class CLIPVisionTower(nn.Module):
@@ -51,7 +178,7 @@ class CLIPVisionTower(nn.Module):
                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
                 image_features.append(image_feature)
         else:
-            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype).float(), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)
 
         return image_features
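A minor observation on the `CLIPVisionTower.forward` hunk: `images.to(device=self.device, dtype=self.dtype).float()` casts the pixel tensor twice whenever `self.dtype` is not already float32. If the intent is to always feed the tower float32 pixels, a single cast is equivalent; a small self-contained check with illustrative tensors only:

```python
import torch

x = torch.randn(2, 3, dtype=torch.float16)

a = x.to(dtype=torch.float16).float()  # cast to fp16 (a no-op here), then to fp32
b = x.to(dtype=torch.float32)          # single cast
assert torch.equal(a, b)
```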
requirements.txt CHANGED
@@ -6,6 +6,7 @@ torchvision==0.16.2
 peakutils
 matplotlib
 protobuf
-transformers
 sentencepiece
-accelerate>=0.26.0
+accelerate>=0.26.0
+bitsandbytes
+transformers==4.37.2
video_keyframe_detector/KeyFrameDetector/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (250 Bytes)
 
video_keyframe_detector/KeyFrameDetector/__pycache__/key_frame_detector.cpython-39.pyc ADDED
Binary file (1.94 kB)
 
video_keyframe_detector/KeyFrameDetector/__pycache__/utils.cpython-39.pyc ADDED
Binary file (2.07 kB)
 
video_keyframe_detector/__pycache__/cli.cpython-39.pyc ADDED
Binary file (744 Bytes)