KingNish committed on
Commit
7e1365b
1 Parent(s): 1ac43cd

Added Video Support

Files changed (1)
  1. app.py +31 -73
app.py CHANGED
@@ -5,100 +5,59 @@ from qwen_vl_utils import process_vision_info
 import torch
 from PIL import Image
 import subprocess
-from datetime import datetime
 import numpy as np
 import os
 
+# Install flash-attn
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
-# models = {
-#     "Qwen/Qwen2-VL-7B-Instruct": AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
-
-# }
-def array_to_image_path(image_array):
-    if image_array is None:
-        raise ValueError("No image provided. Please upload an image before submitting.")
-    # Convert numpy array to PIL Image
-    img = Image.fromarray(np.uint8(image_array))
-
-    # Generate a unique filename using timestamp
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    filename = f"image_{timestamp}.png"
-
-    # Save the image
-    img.save(filename)
-
-    # Get the full path of the saved image
-    full_path = os.path.abspath(filename)
-
-    return full_path
-
-models = {
-    "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True, torch_dtype="auto").cuda().eval()
-
-}
-
-processors = {
-    "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True)
-}
+# Model and Processor Loading (Done once at startup)
+MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16).to("cuda").eval()
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 DESCRIPTION = "[Qwen2-VL-7B Demo](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)"
 
-kwargs = {}
-kwargs['torch_dtype'] = torch.bfloat16
-
-user_prompt = '<|user|>\n'
-assistant_prompt = '<|assistant|>\n'
-prompt_suffix = "<|end|>\n"
-
 @spaces.GPU
-def run_example(image, text_input=None, model_id="Qwen/Qwen2-VL-7B-Instruct"):
-    image_path = array_to_image_path(image)
-
-    print(image_path)
-    model = models[model_id]
-    processor = processors[model_id]
+def qwen_inference(media_path, text_input=None):
 
-    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
-    image = Image.fromarray(image).convert("RGB")
+    image_extensions = Image.registered_extensions()
+    if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
+        media_type = "image"
+    elif media_path.endswith(("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")):  # Check if it's a video path
+        media_type = "video"
+    else:
+        raise ValueError("Unsupported media type. Please upload an image or video.")
+
     messages = [
-        {
+        {
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": image_path,
+                    "type": media_type,
+                    media_type: media_path,
+                    **({"fps": 8.0} if media_type == "video" else {}),
                 },
                 {"type": "text", "text": text_input},
             ],
         }
     ]
-
-    # Preparation for inference
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    )
-    inputs = inputs.to("cuda")
-
-    # Inference: Generation of the output
+    ).to("cuda")
+
     generated_ids = model.generate(**inputs, max_new_tokens=1024)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
+    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
-    return output_text[0]
+    return output_text
 
 css = """
 #output {
@@ -110,17 +69,16 @@ css = """
 
 with gr.Blocks(css=css) as demo:
     gr.Markdown(DESCRIPTION)
-    with gr.Tab(label="Qwen2-VL-7B Input"):
+
+    with gr.Tab(label="Image/Video Input"):
        with gr.Row():
            with gr.Column():
-                input_img = gr.Image(label="Input Picture")
-                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
+                input_media = gr.File(label="Upload Image or Video", type="filepath")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
 
-        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
+        submit_btn.click(qwen_inference, [input_media, text_input], [output_text])
 
-demo.queue(api_open=False)
 demo.launch(debug=True)
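For reference, a minimal sketch of the extension-based routing the new qwen_inference relies on, runnable without the model. The helper name detect_media_type and the sample paths are illustrative only, not part of the commit.

from PIL import Image

def detect_media_type(media_path: str) -> str:
    # Image.registered_extensions() maps dotted suffixes such as ".png" or ".jpg"
    # to Pillow format names; matching against its keys mirrors the image branch.
    image_extensions = tuple(Image.registered_extensions().keys())
    # The video suffixes are bare strings; str.endswith still matches them
    # because it compares raw string endings.
    video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
    if media_path.endswith(image_extensions):
        return "image"
    if media_path.endswith(video_extensions):
        return "video"
    raise ValueError("Unsupported media type. Please upload an image or video.")

print(detect_media_type("example.png"))  # image
print(detect_media_type("example.mp4"))  # video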
 
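And a hedged sketch of the message payload the updated handler builds for a video, showing how process_vision_info splits it before it reaches the processor. "sample_video.mp4" is a hypothetical local file; fps=8.0 mirrors the value hard-coded in the diff above.

from qwen_vl_utils import process_vision_info

messages = [
    {
        "role": "user",
        "content": [
            # For an image the entry would be {"type": "image", "image": media_path} instead.
            {"type": "video", "video": "sample_video.mp4", "fps": 8.0},
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

# Splits the conversation into image and video inputs for the processor; with no
# image entries, image_inputs is expected to be None and video_inputs a list with
# one tensor of sampled frames for the single video.
image_inputs, video_inputs = process_vision_info(messages)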