Spaces: Running on A10G

Commit 6a059b5
Parent(s): 912128d
initial commit
Files changed:
- .gitattributes +1 -0
- README copy.md +15 -0
- app.py +217 -0
- newyorkstreets_small.mp4 +3 -0
- output.ts +0 -0
- requirements.txt +8 -0
.gitattributes CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+newyorkstreets_small.mp4 filter=lfs diff=lfs merge=lfs -text
README copy.md ADDED

@@ -0,0 +1,15 @@
---
title: Omdet Turbo Open Vocabulary
emoji: 📹
colorFrom: red
colorTo: blue
sdk: gradio
sdk_version: 4.42.0
app_file: app.py
pinned: false
license: apache-2.0
short_description: Video captioning/open-vocabulary/zero-shot

---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED

@@ -0,0 +1,217 @@
import os
import time

import cv2
import gradio as gr
import numpy as np
import spaces
import supervision as sv
import torch

from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained("yonigozlan/omdet-turbo-tiny")
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    "yonigozlan/omdet-turbo-tiny"
).to(device)

css = """
.feedback textarea {font-size: 24px !important}
"""

global classes
global detections
global labels
global threshold
classes = "person, bike, car"
detections = None
labels = None
threshold = 0.2

BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()
SUBSAMPLE = 2


def calculate_end_frame_index(source_video_path):
    video_info = sv.VideoInfo.from_video_path(source_video_path)
    return min(video_info.total_frames, video_info.fps * 5)


def annotate_image(input_image, detections, labels) -> np.ndarray:
    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
    return output_image


@spaces.GPU
def process_video(
    input_video,
    confidence_threshold,
    classes_new,
    progress=gr.Progress(track_tqdm=True),
):
    global detections
    global labels
    global classes
    global threshold
    classes = classes_new
    threshold = confidence_threshold
    result_file_name = "output.mp4"
    result_file_path = os.path.join(os.getcwd(), result_file_name)
    batch_fps = []

    cap = cv2.VideoCapture(input_video)

    video_codec = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    desired_fps = fps // SUBSAMPLE
    iterating, frame = cap.read()
    segment_file = cv2.VideoWriter(
        result_file_path, video_codec, desired_fps, (width, height)
    )  # type: ignore
    batch = []
    frames = []
    predict_index = []
    n_frames = 0
    while iterating:
        # frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
        if n_frames % SUBSAMPLE == 0:
            predict_index.append(len(frames))
            batch.append(frame)
        frames.append(frame)
        if len(batch) == desired_fps:
            classes_list = classes.strip(" ").split(",")
            results, fps = query(batch, classes_list, threshold, (width, height))
            for i in range(len(frames)):
                if i in predict_index:
                    batch_index = predict_index.index(i)
                    detections = sv.Detections(
                        xyxy=results[batch_index]["boxes"].cpu().detach().numpy(),
                        confidence=results[batch_index]["scores"]
                        .cpu()
                        .detach()
                        .numpy(),
                        class_id=np.array(
                            [
                                classes_list.index(results_class)
                                for results_class in results[batch_index]["classes"]
                            ]
                        ),
                        data={"class_name": results[batch_index]["classes"]},
                    )
                    labels = results[batch_index]["classes"]
                frame = annotate_image(
                    input_image=frames[i],
                    detections=detections,
                    labels=labels,
                )
                segment_file.write(frame)
            segment_file.release()
            yield (
                result_file_path,
                gr.Markdown(
                    f'<h3 style="text-align: center;">Model inference FPS: {fps*len(batch):.2f}</h3>',
                    visible=True,
                ),
            )
            segment_file = cv2.VideoWriter(
                result_file_path, video_codec, desired_fps, (width, height)
            )  # type: ignore
            batch = []
            frames = []
            predict_index = []
        iterating, frame = cap.read()
        n_frames += 1


def query(frame, classes, confidence_threshold, size=(640, 480)):
    inputs = processor(
        images=frame, text=[classes] * len(frame), return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        start = time.time()
        outputs = model(**inputs)
        fps = 1 / (time.time() - start)
    target_sizes = torch.tensor([size[::-1]] * len(frame))

    results = processor.post_process_grounded_object_detection(
        outputs=outputs,
        classes=[classes] * len(frame),
        score_threshold=confidence_threshold,
        target_sizes=target_sizes,
    )
    return results, fps


def set_classes(classes_input):
    global classes
    classes = classes_input


def set_confidence_threshold(confidence_threshold_input):
    global threshold
    threshold = confidence_threshold_input


with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("## Real Time Open Vocabulary Object Detection with OmDet-Turbo")
    gr.Markdown(
        """
        This is a demo for real-time open-vocabulary object detection using OmDet-Turbo.<br>
        It runs on ZeroGPU, which acquires a GPU each time inference is launched.<br>
        This, combined with video processing time, means that the demo inference time is slower than the model's actual inference time.<br>
        The actual model average inference FPS is displayed under the processed video after inference.
        """
    )
    gr.Markdown(
        "Simply upload a video or try the examples below, and press run. You can then change the detected objects live in the text box! You can also play with the confidence threshold and see how it impacts the objects detected in real time."
    )

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video")
        with gr.Column():
            output_video = gr.Video(label="Output Video", streaming=True, autoplay=True)
            actual_fps = gr.Markdown("", visible=False)
    with gr.Row():
        classes = gr.Textbox(
            "person, cat, dog",
            label="Objects to detect. Change this as you like live!",
            elem_classes="feedback",
            scale=3,
        )
        conf = gr.Slider(
            label="Confidence Threshold",
            minimum=0.1,
            maximum=1.0,
            value=0.2,
            step=0.05,
        )
    with gr.Row():
        submit = gr.Button(variant="primary")

    example = gr.Examples(
        examples=[
            ["./newyorkstreets_small.mp4", 0.3, "person, car, shoe"],
        ],
        inputs=[input_video, conf, classes],
        outputs=[output_video, actual_fps],
    )
    classes.submit(set_classes, classes)
    conf.change(set_confidence_threshold, conf)

    submit.click(
        fn=process_video,
        inputs=[input_video, conf, classes],
        outputs=[output_video, actual_fps],
    )

if __name__ == "__main__":
    demo.launch(show_error=True)
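For reference, the detection path inside process_video/query reduces to a handful of processor and model calls. Below is a minimal single-image sketch using the same checkpoint and the same calls as app.py; the image path is hypothetical, and the post-processing keyword names (classes, score_threshold) follow the pinned transformers branch from requirements.txt, so they may differ in later transformers releases.

import cv2
import torch
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained("yonigozlan/omdet-turbo-tiny")
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    "yonigozlan/omdet-turbo-tiny"
).to(device)

image = cv2.imread("street.jpg")        # hypothetical input image (BGR, as read by cv2 in app.py)
classes = ["person", "car", "bike"]     # free-form open-vocabulary prompts

inputs = processor(images=[image], text=[classes], return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# Rescale boxes back to the original image size; target_sizes is (height, width).
results = processor.post_process_grounded_object_detection(
    outputs=outputs,
    classes=[classes],
    score_threshold=0.2,
    target_sizes=torch.tensor([image.shape[:2]]),
)[0]
for box, score, label in zip(results["boxes"], results["scores"], results["classes"]):
    print(label, float(score), box.tolist())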
newyorkstreets_small.mp4 ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ffa9291f629cda40826435df1a6de7cd6e348af9edb51b22e2056ecc9b535ae
size 39231000
output.ts ADDED

Binary file (434 kB)
requirements.txt ADDED

@@ -0,0 +1,8 @@
torch
timm
numpy==1.26.3
git+https://github.com/yonigozlan/transformers.git@add-om-det-turbo
opencv-python
supervision
gradio-client @ git+https://github.com/gradio-app/gradio@66349fe26827e3a3c15b738a1177e95fec7f5554#subdirectory=client/python
https://gradio-pypi-previews.s3.amazonaws.com/66349fe26827e3a3c15b738a1177e95fec7f5554/gradio-4.42.0-py3-none-any.whl
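Because OmDet-Turbo support here comes from a patched transformers branch and a pre-release Gradio wheel rather than released versions, a quick import check (a sketch, run after pip-installing these requirements) can confirm the environment exposes what app.py imports:

import gradio
import transformers
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

print("transformers:", transformers.__version__)
print("gradio:", gradio.__version__)  # expected 4.42.0 per the pinned wheel
# Loading the processor should fail fast if the OmDet-Turbo code path is missing.
AutoProcessor.from_pretrained("yonigozlan/omdet-turbo-tiny")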