Update README.md
README.md CHANGED
@@ -47,7 +47,7 @@ We have three models with 2, 7 and 72 billion parameters. This repo contains the
 | MMMU<sub>val</sub> | 51.8 | 49.8 | **60**| 54.1 |
 | DocVQA<sub>test</sub> | 91.6 | 90.8 | - | **94.5** |
 | InfoVQA<sub>test</sub> | 74.8 | - | - |**76.5** |
-| ChartQA<sub>test</sub> | 83.3 | - |- | **83.0** |
+| ChartQA<sub>test</sub> | **83.3** | - |- | 83.0 |
 | TextVQA<sub>val</sub> | 77.4 | 80.1 | -| **84.3** |
 | OCRBench | 794 | **852** | 785 | 845 |
 | MTVQA | - | - | -| **26.3** |
@@ -128,7 +128,7 @@ output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=
 print(output_text)
 ```
 <details>
-<summary>
+<summary>Without qwen_vl_utils</summary>

 ```python

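The hunk above retitles a collapsed `<details>` block (file lines 131-179) whose body is not expanded in this diff. For context: the "Without qwen_vl_utils" path skips `process_vision_info` and instead loads the image manually, handing a PIL object straight to the processor. A minimal sketch of that flow, assuming the `Qwen/Qwen2-VL-7B-Instruct` checkpoint name and a placeholder image URL (both illustrative, not taken from the diff):

```python
import requests
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Checkpoint name assumed for illustration; any of the 2B/7B/72B variants works
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Fetch the image yourself instead of letting qwen_vl_utils resolve a URI
url = "https://example.com/demo.jpeg"  # placeholder URL
image = Image.open(requests.get(url, stream=True).raw)

# With a raw PIL image, the content entry carries no "image" URI;
# the image is passed to the processor call directly instead
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=128)
```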
@@ -177,6 +177,72 @@ output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, cl
 print(output_text)
 ```
 </details>
+<details>
+<summary>Multi image inference</summary>
+
+```python
+# Messages containing multiple images and a text query
+messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "Identify the similarities between these images."}]}]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_text)
+```
+</details>
+
+<details>
+<summary>Video inference</summary>
+
+```python
+
+# Messages containing an image list as a video and a text query
+messages = [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/frame1.jpg", "file:///path/to/frame2.jpg", "file:///path/to/frame3.jpg", "file:///path/to/frame4.jpg"], 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
+# Messages containing a video and a text query
+messages = [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", 'max_pixels': 360*420, 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_text)
+```
+</details>
+
+<details>
+<summary>Batch inference</summary>
+
+```python
+
+# Sample messages for batch inference
+messages1 = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "What are the common elements in these pictures?"}]}]
+messages2 = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who are you?"}]
+# Combine messages for batch processing
+messages = [messages1, messages2]
+
+# Preparation for batch inference
+texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Batch Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_texts)
+```
+</details>
 
 ### More Usage Tips
 
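A note on the idiom repeated in each added snippet: `model.generate` returns prompt plus completion for every batch row, so the list comprehension slices off the first `len(in_ids)` tokens to keep only the newly generated ones before decoding. (`process_vision_info` itself comes from the separately installed `qwen-vl-utils` package, as the retitled summary in the second hunk suggests.) A self-contained toy of just that slice, with plain lists standing in for the tensors:

```python
# Toy stand-ins: one list of token ids per batch row
input_ids_batch = [[101, 7, 8], [101, 9]]                      # the prompts
generated_batch = [[101, 7, 8, 42, 43], [101, 9, 77, 78, 79]]  # generate() output

# Same slice as the README snippets: drop the echoed prompt tokens
trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(input_ids_batch, generated_batch)
]
print(trimmed)  # [[42, 43], [77, 78, 79]]
```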
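On the video snippet's options: `fps` tells the pipeline how densely the video is sampled into frames (for a pre-extracted frame list, it declares the rate the frames were taken at), and `max_pixels` caps the per-frame resolution the vision preprocessor will resize to. The diff's example uses `360*420`; a variant with explicit lower and upper pixel budgets might look like the following sketch (the specific bounds are assumptions, not taken from the diff):

```python
# Illustrative pixel budgets only; the 28*28 granularity mirrors the size of
# the model's merged visual patches, but these exact bounds are assumptions
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "file:///path/to/video1.mp4",
                "min_pixels": 256 * 28 * 28,
                "max_pixels": 1280 * 28 * 28,
                "fps": 1.0,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
```

Lower budgets trade visual detail for memory and speed; raising them helps when small text inside frames matters.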