Update README.md
README.md CHANGED
@@ -47,7 +47,7 @@ We have three models with 2, 7 and 72 billion parameters. This repo contains the
 | MMMU<sub>val</sub> | 51.8 | 49.8 | **60**| 54.1 |
 | DocVQA<sub>test</sub> | 91.6 | 90.8 | - | **94.5** |
 | InfoVQA<sub>test</sub> | 74.8 | - | - |**76.5** |
-| ChartQA<sub>test</sub> | 83.3 | - |- | **83.0** |
+| ChartQA<sub>test</sub> | **83.3** | - |- | 83.0 |
 | TextVQA<sub>val</sub> | 77.4 | 80.1 | -| **84.3** |
 | OCRBench | 794 | **852** | 785 | 845 |
 | MTVQA | - | - | -| **26.3** |
@@ -128,7 +128,7 @@ output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=
 print(output_text)
 ```
 <details>
-<summary>
+<summary>Without qwen_vl_utils</summary>

 ```python

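The hunk above retitles a collapsed `<details>` block (file lines 131-179) whose body is not expanded in this diff. For context: the "Without qwen_vl_utils" path skips `process_vision_info` and instead loads the image manually, handing a PIL object straight to the processor. A minimal sketch of that flow, assuming the `Qwen/Qwen2-VL-7B-Instruct` checkpoint name and a placeholder image URL (both illustrative, not taken from the diff):

```python
import requests
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Checkpoint name assumed for illustration; any of the 2B/7B/72B variants works
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Fetch the image yourself instead of letting qwen_vl_utils resolve a URI
url = "https://example.com/demo.jpeg"  # placeholder URL
image = Image.open(requests.get(url, stream=True).raw)

# With a raw PIL image, the content entry carries no "image" URI;
# the image is passed to the processor call directly instead
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=128)
```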
@@ -177,6 +177,72 @@ output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, cl
 print(output_text)
 ```
 </details>
+<details>
+<summary>Multi image inference</summary>
+
+```python
+# Messages containing multiple images and a text query
+messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "Identify the similarities between these images."}]}]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_text)
+```
+</details>
+
+<details>
+<summary>Video inference</summary>
+
+```python
+
+# Messages containing an image list as a video and a text query
+messages = [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/frame1.jpg", "file:///path/to/frame2.jpg", "file:///path/to/frame3.jpg", "file:///path/to/frame4.jpg"], 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
+# Messages containing a video and a text query
+messages = [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", 'max_pixels': 360*420, 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_text)
+```
+</details>
+
+<details>
+<summary>Batch inference</summary>
+
+```python
+
+# Sample messages for batch inference
+messages1 = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "What are the common elements in these pictures?"}]}]
+messages2 = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who are you?"}]
+# Combine messages for batch processing
+messages = [messages1, messages2]
+
+# Preparation for batch inference
+texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Batch Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_texts)
+```
+</details>
 
 ### More Usage Tips
 
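A note on the idiom repeated in each added snippet: `model.generate` returns prompt plus completion for every batch row, so the list comprehension slices off the first `len(in_ids)` tokens to keep only the newly generated ones before decoding. (`process_vision_info` itself comes from the separately installed `qwen-vl-utils` package, as the retitled summary in the second hunk suggests.) A self-contained toy of just that slice, with plain lists standing in for the tensors:

```python
# Toy stand-ins: one list of token ids per batch row
input_ids_batch = [[101, 7, 8], [101, 9]]                      # the prompts
generated_batch = [[101, 7, 8, 42, 43], [101, 9, 77, 78, 79]]  # generate() output

# Same slice as the README snippets: drop the echoed prompt tokens
trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(input_ids_batch, generated_batch)
]
print(trimmed)  # [[42, 43], [77, 78, 79]]
```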
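On the video snippet's options: `fps` tells the pipeline how densely the video is sampled into frames (for a pre-extracted frame list, it declares the rate the frames were taken at), and `max_pixels` caps the per-frame resolution the vision preprocessor will resize to. The diff's example uses `360*420`; a variant with explicit lower and upper pixel budgets might look like the following sketch (the specific bounds are assumptions, not taken from the diff):

```python
# Illustrative pixel budgets only; the 28*28 granularity mirrors the size of
# the model's merged visual patches, but these exact bounds are assumptions
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "file:///path/to/video1.mp4",
                "min_pixels": 256 * 28 * 28,
                "max_pixels": 1280 * 28 * 28,
                "fps": 1.0,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
```

Lower budgets trade visual detail for memory and speed; raising them helps when small text inside frames matters.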