teowu committed
Commit
241c337
1 Parent(s): a118490

Upload modeling_mplug_owl2.py with huggingface_hub

Files changed (1)
  1. modeling_mplug_owl2.py +2 -2
modeling_mplug_owl2.py CHANGED
@@ -275,14 +275,14 @@ class MPLUGOwl2LlamaForCausalLM(LlamaForCausalLM, MPLUGOwl2MetaForCausalLM):
             input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
             with torch.inference_mode():
                 image_tensor = self.image_processor.preprocess(images, return_tensors="pt")["pixel_values"].half().to(self.device)
-                output_logits = model(input_ids.repeat(image_tensor.shape[0], 1),
+                output_logits = self(input_ids.repeat(image_tensor.shape[0], 1),
                                       images=image_tensor)["logits"][:,-1, self.preferential_ids_]
                 return torch.softmax(output_logits, -1) @ self.weight_tensor
         else:
             video = [[expand2square(frame, tuple(int(x*255) for x in self.image_processor.image_mean)) for frame in vid] for vid in images]
             with torch.inference_mode():
                 video_tensors = [self.image_processor.preprocess(vid, return_tensors="pt")["pixel_values"].half().to(self.model.device) for vid in video]
-                output_logits = self.model(self.input_ids.repeat(len(video_tensors), 1),
+                output_logits = self(self.input_ids.repeat(len(video_tensors), 1),
                                            images=video_tensors)["logits"][:,-1, self.preferential_ids_]
                 return torch.softmax(output_logits, -1) @ self.weight_tensor
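
For context on the two changed lines: both are the final readout of the scoring path, and the commit switches the call to self(...), which routes through MPLUGOwl2LlamaForCausalLM.forward and its images= keyword, rather than model(...) (a name that is apparently not defined in the shown scope) or self.model(...) (the inner language model). The sketch below reproduces only that weighted-softmax readout in isolation, as a minimal illustration; the shapes, token ids, and weights are placeholder assumptions, not values from this repository.

import torch

# Minimal sketch (assumed values, not from the repo): restrict the logits at
# the last position to a fixed set of rating tokens, softmax over them, then
# take a weighted sum to collapse the distribution into one scalar per item.
batch, seq_len, vocab_size = 4, 128, 32000          # assumed shapes
preferential_ids_ = [1781, 6460, 6669, 4319, 4868]  # hypothetical rating-token ids
weight_tensor = torch.tensor([5.0, 4.0, 3.0, 2.0, 1.0])  # assumed level weights

logits = torch.randn(batch, seq_len, vocab_size)    # stand-in for self(...)["logits"]
output_logits = logits[:, -1, preferential_ids_]    # last position, rating tokens only
scores = torch.softmax(output_logits, -1) @ weight_tensor  # shape: (batch,)
print(scores)

Reading off a probability-weighted average over discrete rating tokens yields a continuous score while keeping the model a stock causal LM; the diff changes only which object's forward produces those logits.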