andrewqian123
/

LLAMA_BATCH

@@ -274,7 +274,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
     def chat(
         self,
-        image,
         msgs,
         tokenizer,
         processor=None,
@@ -290,42 +290,45 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
             processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
         if isinstance(msgs, str):
             msgs = json.loads(msgs)
-        copy_msgs = deepcopy(msgs)
         assert len(msgs) > 0, "msgs is empty"
         assert sampling or not stream, "if use stream mode, make sure sampling=True"
-        if image is not None and isinstance(copy_msgs[0]["content"], str):
-            # copy_msgs[0]['content'] = '(<image>./</image>)\n' + copy_msgs[0]['content']
-            #copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]
-            #copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]
-            for im in image:
-                copy_msgs[0]["content"] = [copy_msgs[0]["content"]]
-                copy_msgs[0]["content"].insert(-1, im)
-        images = []
-        for i, msg in enumerate(copy_msgs):
-            role = msg["role"]
-            content = msg["content"]
-            assert role in ["user", "assistant"]
-            if i == 0:
-                assert role == "user", "The role of first msg should be user"
-            if isinstance(content, str):
-                content = [content]
-            cur_msgs = []
-            for c in content:
-                if isinstance(c, Image.Image):
-                    images.append(c)
-                    cur_msgs.append("(<image>./</image>)")
-                elif isinstance(c, str):
-                    cur_msgs.append(c)
-            msg["content"] = "\n".join(cur_msgs)
-        if system_prompt:
-            sys_msg = {'role': 'system', 'content': system_prompt}
-            copy_msgs = [sys_msg] + copy_msgs
-        prompt = processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)
-        inputs = processor(prompt, images, return_tensors="pt", max_length=max_inp_length).to(self.device)
         if sampling:
             generation_config = {

     def chat(
         self,
+        images,
         msgs,
         tokenizer,
         processor=None,
             processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
         if isinstance(msgs, str):
             msgs = json.loads(msgs)
+        # copy_msgs = deepcopy(msgs)
         assert len(msgs) > 0, "msgs is empty"
         assert sampling or not stream, "if use stream mode, make sure sampling=True"
+        assert(len(msgs) == len(images)), "Make sure to have one image per item in your batch"
+        batchM = []
+        batchI = []
+        for ind in range(len(images)):
+            image = images[ind]
+            if image is not None and isinstance(copy_msgs[0]["content"], str):
+                # deep copy element
+                copy_msgs = deepcopy(msgs[ind])
+                copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]
+            imagelist = []
+            for i, msg in enumerate(copy_msgs):
+                role = msg["role"]
+                content = msg["content"]
+                assert role in ["user", "assistant"]
+                if i == 0:
+                    assert role == "user", "The role of first msg should be user"
+                if isinstance(content, str):
+                    content = [content]
+                cur_msgs = []
+                for c in content:
+                    if isinstance(c, Image.Image):
+                        imagelist.append(c)
+                        cur_msgs.append("(<image>./</image>)")
+                    elif isinstance(c, str):
+                        cur_msgs.append(c)
+                msg["content"] = "\n".join(cur_msgs)
+            if system_prompt:
+                sys_msg = {'role': 'system', 'content': system_prompt}
+                copy_msgs = [sys_msg] + copy_msgs
+            batchM.append(copy_msgs)
+            batchI.append(imagelist)
+        prompt = processor.tokenizer.apply_chat_template(batchM, tokenize=False, add_generation_prompt=True)
+        inputs = processor(prompt, batchI, return_tensors="pt", max_length=max_inp_length).to(self.device)
         if sampling:
             generation_config = {