import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
import numpy as np
import os
# Install flash-attn; FLASH_ATTENTION_SKIP_CUDA_BUILD skips compiling CUDA kernels at install time
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Model and Processor Loading (Done once at startup)
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
DESCRIPTION = "[Qwen2-VL-7B Demo](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)"
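# ZeroGPU Space: the @spaces.GPU decorator allocates a GPU for the duration of each call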
@spaces.GPU
def qwen_inference(media_path, text_input=None):
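    # Infer the media type from the file extension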
    image_extensions = Image.registered_extensions()
    if media_path.endswith(tuple(image_extensions)):
        media_type = "image"
    elif media_path.endswith(("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "gif", "webm", "m4v", "3gp")):  # Check if it's a video path
        media_type = "video"
    else:
        raise ValueError("Unsupported media type. Please upload an image or video.")
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path,
                    **({"fps": 8.0} if media_type == "video" else {}),
                },
                {"type": "text", "text": text_input},
            ],
        }
    ]
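    # Render the chat template to a prompt string and extract image/video tensors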
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
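    # Generate up to 1024 new tokens, then trim the prompt tokens before decoding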
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return output_text
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Image/Video Input"):
        with gr.Row():
            with gr.Column():
                input_media = gr.File(label="Upload Image or Video", type="filepath")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text", elem_id="output")
        submit_btn.click(qwen_inference, [input_media, text_input], [output_text])
demo.launch(debug=True)