hperkins committed on
Commit
e4524b0
1 Parent(s): fbc8418

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +5 -3
handler.py CHANGED
@@ -9,7 +9,7 @@ import io
9
  from PIL import Image
10
  import logging
11
  import requests
12
- from moviepy.editor import VideoFileClip # For video frame extraction
13
 
14
  class EndpointHandler():
15
  def __init__(self, path=""):
@@ -33,6 +33,7 @@ class EndpointHandler():
33
  # Construct the messages list from the input string
34
  messages = [{"role": "user", "content": self._parse_input(inputs)}]
35
 
 
36
  text = self.processor.apply_chat_template(
37
  messages, tokenize=False, add_generation_prompt=True
38
  )
@@ -47,6 +48,7 @@ class EndpointHandler():
47
  )
48
  inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
49
 
 
50
  generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
51
  generated_ids_trimmed = [
52
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -65,11 +67,11 @@ class EndpointHandler():
65
  if i % 2 == 0: # Text part
66
  content.append({"type": "text", "text": part.strip()})
67
  else: # Image/video part
68
- if part.startswith("video:"):
69
  video_path = part.split("video:")[1].strip()
70
  video_frames = self._extract_video_frames(video_path)
71
  if video_frames:
72
- content.append({"type": "video", "video": video_frames, "fps": 1}) # Add fps
73
  else:
74
  image = self._load_image(part.strip())
75
  if image:
 
9
  from PIL import Image
10
  import logging
11
  import requests
12
+ from moviepy.editor import VideoFileClip
13
 
14
  class EndpointHandler():
15
  def __init__(self, path=""):
 
33
  # Construct the messages list from the input string
34
  messages = [{"role": "user", "content": self._parse_input(inputs)}]
35
 
36
+ # Prepare for inference (using qwen_vl_utils)
37
  text = self.processor.apply_chat_template(
38
  messages, tokenize=False, add_generation_prompt=True
39
  )
 
48
  )
49
  inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
50
 
51
+ # Inference
52
  generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
53
  generated_ids_trimmed = [
54
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
67
  if i % 2 == 0: # Text part
68
  content.append({"type": "text", "text": part.strip()})
69
  else: # Image/video part
70
+ if part.lower().startswith("video:"):
71
  video_path = part.split("video:")[1].strip()
72
  video_frames = self._extract_video_frames(video_path)
73
  if video_frames:
74
+ content.append({"type": "video", "video": video_frames, "fps": 1})
75
  else:
76
  image = self._load_image(part.strip())
77
  if image: