import subprocess
import sys


def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
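

# Pin and install dependencies at startup so the demo runs as a single
# self-contained file (e.g., on a Hugging Face Space); in a packaged app these
# would normally live in requirements.txt instead.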
install("torch==2.4.1")
install("torchvision==0.19.1")
install("pixeltable==0.2.20")
install("git+https://github.com/Megvii-BaseDetection/YOLOX.git")
import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import FrameIterator
from pixeltable.ext.functions.yolox import yolox
import PIL.Image
import PIL.ImageDraw


@pxt.udf
def draw_boxes(img: PIL.Image.Image, boxes: list[list[float]]) -> PIL.Image.Image:
    result = img.copy()  # draw on a copy so the stored frame stays untouched
    d = PIL.ImageDraw.Draw(result)
    for box in boxes:
        # each box is [x1, y1, x2, y2] in pixel coordinates
        d.rectangle(box, width=3)
    return result
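
# As a Pixeltable UDF, draw_boxes can be referenced directly inside queries and
# is then applied row by row; it is used that way in the make_video() call below.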


def process_video(video_file, model_id, threshold, progress=gr.Progress()):
    progress(0, desc="Initializing...")

    # Start from a clean slate: drop any tables left over from a previous run.
    pxt.drop_dir('video_tutorial', force=True)
    pxt.create_dir('video_tutorial')

    # Base table: one row per uploaded video.
    videos_table = pxt.create_table(
        'video_tutorial.videos',
        {'video': pxt.VideoType()}
    )

    # View over the table: one row per extracted frame, sampled at 5 frames
    # per second, without duplicating the underlying video data.
    frames_view = pxt.create_view(
        'video_tutorial.frames',
        videos_table,
        iterator=FrameIterator.create(video=videos_table.video, fps=5)
    )

    # Inserting the video into the base table populates the frames view automatically.
    videos_table.insert([{'video': video_file.name}])

    progress(0.3, desc="Running Model...")

    # Computed column: YOLOX runs once over every frame and the detections are
    # stored alongside the frames.
    frames_view[f'detect_{model_id}'] = yolox(
        frames_view.frame, model_id=model_id, threshold=threshold
    )

    progress(0.6, desc="Object detection completed...")

    # Sample every other frame (up to 10) for the gallery.
    frame_gallery = (
        frames_view.select(frames_view.frame)
        .where(frames_view.pos % 2 == 0)
        .limit(10)
        .collect()['frame']
    )

    progress(0.8, desc="Outputs generated, retrieving video...")

    # Re-assemble the annotated frames into one output video per input video.
    output_video = frames_view.group_by(videos_table).select(
        pxt.functions.video.make_video(
            frames_view.pos,
            draw_boxes(
                frames_view.frame,
                frames_view[f'detect_{model_id}'].bboxes
            )
        )
    ).collect()['col_0'][0]  # make_video() yields an unnamed column, exposed as 'col_0'

    return output_video, frame_gallery
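
# For reference, a minimal sketch of querying the stored detections directly
# (hypothetical follow-up, assuming a populated view from a run with
# model_id='yolox_tiny'):
#
#   frames = pxt.get_table('video_tutorial.frames')
#   rows = frames.select(frames.frame, frames.detect_yolox_tiny.bboxes).collect()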


with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(
        """
        <div style="max-width: 800px; margin: 0 auto;">
            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />
            <h1 style="margin-bottom: 0.5em;">Object Detection in Videos</h1>
        </div>
        """
    )
    gr.HTML(
        """
        <p>
            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
        </p>
        """
    )

    gr.HTML(
        """
        <div style="background-color: #E5DDD4; border: 1px solid #e9ecef; color: #000000; border-radius: 8px; padding: 15px; margin-bottom: 20px;">
            <strong style="color: #000000">Disclaimer:</strong> This app is best run on your own hardware with a GPU for optimal performance. This Hugging Face Space uses the free tier (2 vCPUs, 16 GB RAM), which may result in slower processing times, especially for large video files. For better performance, you can <a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox?duplicate=true" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">duplicate this Hugging Face Space</a> and run it locally, or use Google Colab with its free, limited GPU support.
        </div>
        """
    )

    with gr.Row():
        with gr.Column():
            with gr.Accordion("What This Demo Does", open=True):
                gr.Markdown("""
                1. **Ingests Videos**: Uploads your video to a Pixeltable table.
                2. **Processes and Retrieves Data**: Stores, versions, chunks, and retrieves the video and its frames.
                3. **Detects Objects**: Leverages Pixeltable's YOLOX integration to produce object detection results.
                4. **Visualizes Output**: Displays the processed video alongside a sample of the original frames.
                """)

    with gr.Row():
        with gr.Column():
            video_file = gr.File(label="Upload Video", file_count="single")

        with gr.Column():
            model_id = gr.Radio(
                choices=['yolox_tiny', 'yolox_m', 'yolox_x'],
                value='yolox_tiny',
                label="YOLOX Model",
                interactive=True
            )
            threshold = gr.Slider(minimum=0.1, maximum=0.9, value=0.25, step=0.05, label="Threshold")
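
    # The threshold is the minimum detection confidence; boxes scoring below it
    # are dropped from the results.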

    gr.Examples(
        examples=[
            ["bangkok.mp4", "yolox_tiny", 0.25],
            ["lotr.mp4", "yolox_m", 0.3],
            ["mi.mp4", "yolox_x", 0.5],
        ],
        inputs=[video_file, model_id, threshold],
        fn=process_video
    )
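
    # Note: the example clips (bangkok.mp4, lotr.mp4, mi.mp4) must be present in
    # the working directory for gr.Examples to load them.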

    process_button = gr.Button("Process Video")

    with gr.Row():
        with gr.Column(scale=1):
            output_video = gr.Video(label="Processed Video with Detections")

        with gr.Column(scale=1):
            frame_gallery = gr.Gallery(label="Frame Gallery", show_label=True, elem_id="gallery")

    # Wire the button to the pipeline; the two return values of process_video
    # map to (output_video, frame_gallery) in order.
    process_button.click(
        process_video,
        inputs=[video_file, model_id, threshold],
        outputs=[output_video, frame_gallery]
    )

if __name__ == "__main__":
    demo.launch(debug=True)