Update modeling_mplug_owl2.py
modeling_mplug_owl2.py  CHANGED  (+7 -4)
@@ -1,4 +1,4 @@
-# Copyright 2023 Haotian Liu & Qinghao Ye (Modified from LLaVA)
+# Copyright 2023 Haotian Liu & Qinghao Ye & Haoning Wu (Modified from LLaVA, and mPLUG-Owl2)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -271,20 +271,23 @@ class MPLUGOwl2LlamaForCausalLM(LlamaForCausalLM, MPLUGOwl2MetaForCausalLM):
               task_: str = "quality",
               input_: str = "image",
               return_dict=False,
+              image_tensor = None,
               ):
         if not hasattr(self, "weight_tensor"):
             self.weight_tensor = torch.Tensor([5.,4.,3.,2.,1.]).half().to(self.device)
         prompt = "USER: How would you rate the {} of this {}?\n<|image|>\nASSISTANT: The {} of the {} is".format(task_, input_, task_, input_)
         if input_ == "image":
-            images = [expand2square(img, tuple(int(x*255) for x in self.image_processor.image_mean)) for img in images]
-            input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
-            with torch.inference_mode():
+            if image_tensor is None:
+                images = [expand2square(img, tuple(int(x*255) for x in self.image_processor.image_mean)) for img in images]
+                input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
                 image_tensor = self.image_processor.preprocess(images, return_tensors="pt")["pixel_values"].half().to(self.device)
+            with torch.inference_mode():
                 output_logits = self(input_ids.repeat(image_tensor.shape[0], 1),
                                      images=image_tensor)["logits"][:,-1, self.preferential_ids_]
                 if return_dict:
                     return {"logits": output_logits, "scores": torch.softmax(output_logits, -1) @ self.weight_tensor}
                 return torch.softmax(output_logits, -1) @ self.weight_tensor
+
         else:
             video = [[expand2square(frame, tuple(int(x*255) for x in self.image_processor.image_mean)) for frame in vid] for vid in images]
             input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
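The functional change in the second hunk is a new image_tensor keyword on score(), so callers can supply precomputed pixel values instead of PIL images, plus a torch.inference_mode() guard around the forward pass. Below is a minimal usage sketch of the unchanged default path (image_tensor=None); the repository id, the AutoModelForCausalLM/trust_remote_code loading, and the first positional argument being a list of PIL images are assumptions, only the task_/input_ keywords come from the hunk above.

import torch
from PIL import Image
from transformers import AutoModelForCausalLM

# Assumed repo id hosting this modeling_mplug_owl2.py; loading via
# trust_remote_code is also an assumption, not part of the commit.
model = AutoModelForCausalLM.from_pretrained(
    "q-future/one-align",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

images = [Image.open("example.jpg")]

# Default path (image_tensor=None): score() pads the images with
# expand2square, tokenizes the rating prompt, and preprocesses the
# pixel values itself, as in the diff above.
scores = model.score(images, task_="quality", input_="image")
print(scores)  # one scalar per image, a softmax-weighted rating in [1, 5]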
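The returned value itself is a softmax-weighted expectation over the logits of five rating tokens (self.preferential_ids_), with level weights [5, 4, 3, 2, 1] stored in self.weight_tensor. A standalone sketch of just that arithmetic follows; the level names are the usual "excellent" through "bad" scale and are stated here as an assumption, since they do not appear in this diff.

import torch

# Sketch of the scoring arithmetic in score(): softmax over the five
# rating-token logits, then a dot product with the level weights,
# giving an expected rating in [1, 5].
levels = ["excellent", "good", "fair", "poor", "bad"]   # assumed level names
weight_tensor = torch.tensor([5., 4., 3., 2., 1.])

# Pretend logits for one image at the five preferential token positions.
output_logits = torch.tensor([[2.1, 1.3, 0.2, -0.5, -1.0]])

probs = torch.softmax(output_logits, dim=-1)   # shape [1, 5], sums to 1
score = probs @ weight_tensor                  # shape [1]
print({lvl: round(p.item(), 3) for lvl, p in zip(levels, probs[0])})
print(score.item())  # roughly 4.3 here: mass sits on "excellent"/"good"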