yonigozlan (HF staff) committed
Commit 6a059b5 • Parent: 912128d

initial commit

Files changed (6)
  1. .gitattributes +1 -0
  2. README copy.md +15 -0
  3. app.py +217 -0
  4. newyorkstreets_small.mp4 +3 -0
  5. output.ts +0 -0
  6. requirements.txt +8 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ newyorkstreets_small.mp4 filter=lfs diff=lfs merge=lfs -text
README copy.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: Omdet Turbo Open Vocabulary
+ emoji: 📹
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.42.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: Video captioning/open-vocabulary/zero-shot
+
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,217 @@
+ import os
+ import time
+
+ import cv2
+ import gradio as gr
+ import numpy as np
+ import spaces
+ import supervision as sv
+ import torch
+
+ from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ processor = AutoProcessor.from_pretrained("yonigozlan/omdet-turbo-tiny")
+ model = AutoModelForZeroShotObjectDetection.from_pretrained(
+     "yonigozlan/omdet-turbo-tiny"
+ ).to(device)
+
+ css = """
+ .feedback textarea {font-size: 24px !important}
+ """
+
+ global classes
+ global detections
+ global labels
+ global threshold
+ classes = "person, bike, car"
+ detections = None
+ labels = None
+ threshold = 0.2
+
+ BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
+ MASK_ANNOTATOR = sv.MaskAnnotator()
+ LABEL_ANNOTATOR = sv.LabelAnnotator()
+ SUBSAMPLE = 2
+
+
+ def calculate_end_frame_index(source_video_path):
+     video_info = sv.VideoInfo.from_video_path(source_video_path)
+     return min(video_info.total_frames, video_info.fps * 5)
+
+
+ def annotate_image(input_image, detections, labels) -> np.ndarray:
+     output_image = MASK_ANNOTATOR.annotate(input_image, detections)
+     output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+     output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
+     return output_image
+
+
+ @spaces.GPU
+ def process_video(
+     input_video,
+     confidence_threshold,
+     classes_new,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     global detections
+     global labels
+     global classes
+     global threshold
+     classes = classes_new
+     threshold = confidence_threshold
+     result_file_name = "output.mp4"
+     result_file_path = os.path.join(os.getcwd(), result_file_name)
+     batch_fps = []
+
+     cap = cv2.VideoCapture(input_video)
+
+     video_codec = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     desired_fps = fps // SUBSAMPLE
+     iterating, frame = cap.read()
+     segment_file = cv2.VideoWriter(
+         result_file_path, video_codec, desired_fps, (width, height)
+     )  # type: ignore
+     batch = []
+     frames = []
+     predict_index = []
+     n_frames = 0
+     while iterating:
+         # frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
+         if n_frames % SUBSAMPLE == 0:
+             predict_index.append(len(frames))
+             batch.append(frame)
+         frames.append(frame)
+         if len(batch) == desired_fps:
+             classes_list = classes.strip(" ").split(",")
+             results, fps = query(batch, classes_list, threshold, (width, height))
+             for i in range(len(frames)):
+                 if i in predict_index:
+                     batch_index = predict_index.index(i)
+                     detections = sv.Detections(
+                         xyxy=results[batch_index]["boxes"].cpu().detach().numpy(),
+                         confidence=results[batch_index]["scores"]
+                         .cpu()
+                         .detach()
+                         .numpy(),
+                         class_id=np.array(
+                             [
+                                 classes_list.index(results_class)
+                                 for results_class in results[batch_index]["classes"]
+                             ]
+                         ),
+                         data={"class_name": results[batch_index]["classes"]},
+                     )
+                     labels = results[batch_index]["classes"]
+                 frame = annotate_image(
+                     input_image=frames[i],
+                     detections=detections,
+                     labels=labels,
+                 )
+                 segment_file.write(frame)
+             segment_file.release()
+             yield (
+                 result_file_path,
+                 gr.Markdown(
+                     f'<h3 style="text-align: center;">Model inference FPS: {fps*len(batch):.2f}</h3>',
+                     visible=True,
+                 ),
+             )
+             segment_file = cv2.VideoWriter(
+                 result_file_path, video_codec, desired_fps, (width, height)
+             )  # type: ignore
+             batch = []
+             frames = []
+             predict_index = []
+         iterating, frame = cap.read()
+         n_frames += 1
+
+
+ def query(frame, classes, confidence_threshold, size=(640, 480)):
+     inputs = processor(
+         images=frame, text=[classes] * len(frame), return_tensors="pt"
+     ).to(device)
+     with torch.no_grad():
+         start = time.time()
+         outputs = model(**inputs)
+         fps = 1 / (time.time() - start)
+     target_sizes = torch.tensor([size[::-1]] * len(frame))
+
+     results = processor.post_process_grounded_object_detection(
+         outputs=outputs,
+         classes=[classes] * len(frame),
+         score_threshold=confidence_threshold,
+         target_sizes=target_sizes,
+     )
+     return results, fps
+
+
+ def set_classes(classes_input):
+     global classes
+     classes = classes_input
+
+
+ def set_confidence_threshold(confidence_threshold_input):
+     global threshold
+     threshold = confidence_threshold_input
+
+
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+     gr.Markdown("## Real Time Open Vocabulary Object Detection with OmDet-Turbo")
+     gr.Markdown(
+         """
+         This is a demo for real-time open-vocabulary object detection using OmDet-Turbo.<br>
+         It runs on ZeroGPU, which has to acquire a GPU each time inference is launched.<br>
+         This, combined with video processing time, makes the demo slower than the model's actual inference speed.<br>
+         The model's actual average inference FPS is displayed under the processed video after inference.
+         """
+     )
+     gr.Markdown(
+         "Simply upload a video or try the examples below 👇, and press Run. You can then change the objects to detect live in the text box! You can also play with the confidence threshold and see how it impacts the objects detected in real time."
+     )
+
+     with gr.Row():
+         with gr.Column():
+             input_video = gr.Video(label="Input Video")
+         with gr.Column():
+             output_video = gr.Video(label="Output Video", streaming=True, autoplay=True)
+             actual_fps = gr.Markdown("", visible=False)
+     with gr.Row():
+         classes = gr.Textbox(
+             "person, cat, dog",
+             label="Objects to detect. Change this as you like live!",
+             elem_classes="feedback",
+             scale=3,
+         )
+         conf = gr.Slider(
+             label="Confidence Threshold",
+             minimum=0.1,
+             maximum=1.0,
+             value=0.2,
+             step=0.05,
+         )
+     with gr.Row():
+         submit = gr.Button(variant="primary")
+
+     example = gr.Examples(
+         examples=[
+             ["./newyorkstreets_small.mp4", 0.3, "person, car, shoe"],
+         ],
+         inputs=[input_video, conf, classes],
+         outputs=[output_video, actual_fps],
+     )
+     classes.submit(set_classes, classes)
+     conf.change(set_confidence_threshold, conf)
+
+     submit.click(
+         fn=process_video,
+         inputs=[input_video, conf, classes],
+         outputs=[output_video, actual_fps],
+     )
+
+ if __name__ == "__main__":
+     demo.launch(show_error=True)
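For reference, the inference path in query() above is the usual zero-shot detection flow: the processor encodes frames together with the text prompts, the model runs once per batch, and post-processing maps boxes back to the original frame size. Below is a minimal single-image sketch mirroring those same calls, assuming the add-om-det-turbo transformers branch pinned in requirements.txt; the COCO sample URL and the 0.3 threshold are purely illustrative.

import requests
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

# Same checkpoint the Space uses.
processor = AutoProcessor.from_pretrained("yonigozlan/omdet-turbo-tiny")
model = AutoModelForZeroShotObjectDetection.from_pretrained("yonigozlan/omdet-turbo-tiny")

# Illustrative sample image; any RGB image works.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
classes = ["cat", "remote"]

inputs = processor(images=image, text=classes, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes expects (height, width); PIL's image.size is (width, height).
results = processor.post_process_grounded_object_detection(
    outputs=outputs,
    classes=[classes],
    score_threshold=0.3,  # illustrative threshold
    target_sizes=torch.tensor([image.size[::-1]]),
)[0]

for box, score, label in zip(results["boxes"], results["scores"], results["classes"]):
    print(label, round(score.item(), 2), [round(v, 1) for v in box.tolist()])

The app batches this over roughly one second of subsampled frames at a time (len(batch) == desired_fps), which is why the displayed FPS is reported as fps * len(batch).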
newyorkstreets_small.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ffa9291f629cda40826435df1a6de7cd6e348af9edb51b22e2056ecc9b535ae
+ size 39231000
output.ts ADDED
Binary file (434 kB).
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ timm
+ numpy==1.26.3
+ git+https://github.com/yonigozlan/transformers.git@add-om-det-turbo
+ opencv-python
+ supervision
+ gradio-client @ git+https://github.com/gradio-app/gradio@66349fe26827e3a3c15b738a1177e95fec7f5554#subdirectory=client/python
+ https://gradio-pypi-previews.s3.amazonaws.com/66349fe26827e3a3c15b738a1177e95fec7f5554/gradio-4.42.0-py3-none-any.whl