Update README.md
README.md CHANGED
@@ -129,7 +129,7 @@ output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=
 print(output_text)
 ```
 <details>
-<summary>
+<summary>Without qwen_vl_utils</summary>
 
 ```python
 
@@ -179,6 +179,73 @@ print(output_text)
 ```
 </details>
 
+<details>
+<summary>Multi image inference</summary>
+
+```python
+# Messages containing multiple images and a text query
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "file:///path/to/image1.jpg"},
+            {"type": "image", "image": "file:///path/to/image2.jpg"},
+            {"type": "text", "text": "Identify the similarities between these images."},
+        ],
+    }
+]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_text)
+```
+</details>
+
+<details>
+<summary>Video inference</summary>
+
+```python
+# Messages containing an image list as a video and a text query
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "video": [
+                    "file:///path/to/frame1.jpg",
+                    "file:///path/to/frame2.jpg",
+                    "file:///path/to/frame3.jpg",
+                    "file:///path/to/frame4.jpg",
+                ],
+                "fps": 1.0,
+            },
+            {"type": "text", "text": "Describe this video."},
+        ],
+    }
+]
+# Messages containing a video and a text query
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "file:///path/to/video1.mp4", "max_pixels": 360 * 420, "fps": 1.0},
+            {"type": "text", "text": "Describe this video."},
+        ],
+    }
+]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_text)
+```
+</details>
+
+<details>
+<summary>Batch inference</summary>
+
+```python
+# Sample messages for batch inference
+messages1 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "file:///path/to/image1.jpg"},
+            {"type": "image", "image": "file:///path/to/image2.jpg"},
+            {"type": "text", "text": "What are the common elements in these pictures?"},
+        ],
+    }
+]
+messages2 = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Who are you?"},
+]
+# Combine messages for batch processing
+messages = [messages1, messages2]
+
+# Preparation for batch inference
+texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Batch inference
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(output_texts)
+```
+</details>
+
 ### More Usage Tips
 
 For input images, we support local files, base64, and URLs. For videos, we currently only support local files.
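
For illustration, here is a minimal sketch of the three image input forms mentioned above, assembled into a single message. The paths and URL are placeholders, and the `data:image;base64,` prefix is an assumed data-URI convention consistent with the `file://` scheme used in the examples; treat the exact scheme as an assumption rather than a guarantee.

```python
import base64

# Local file path (the form used throughout the examples above)
local_image = {"type": "image", "image": "file:///path/to/your/image.jpg"}

# Image referenced by URL
url_image = {"type": "image", "image": "https://path/to/your/image.jpg"}

# Base64-encoded image embedded as a data URI (assumed prefix; payload built here for illustration)
with open("/path/to/your/image.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")
base64_image = {"type": "image", "image": f"data:image;base64,{encoded}"}

# Any mix of these forms can appear in one message; from here, inference proceeds
# exactly as in the examples above (apply_chat_template, process_vision_info, processor).
messages = [
    {
        "role": "user",
        "content": [
            local_image,
            url_image,
            base64_image,
            {"type": "text", "text": "Describe these images."},
        ],
    }
]
```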