File size: 6,184 Bytes
64a1432 18c960c cd1a88f 18c960c cd1a88f 18c960c 9253c20 18c960c 6eeb43a 18c960c 6eeb43a f60f5dd 18c960c cd1a88f 18c960c 1a09192 18c960c 1a09192 18c960c 1a09192 18c960c cd1a88f 7f4e676 1a09192 cd1a88f 1a09192 18c960c 258f1d6 9b67e0b 18c960c cd1a88f 18c960c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import subprocess
import sys
def install(package):
    """Install ``package`` with pip, using the interpreter that runs this script.

    Args:
        package: A pip requirement specifier (name, pinned version, or VCS URL).

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    # `sys.executable -m pip` targets the environment of the running interpreter.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)
# Runtime dependencies, installed one after another (in this order) before the
# third-party imports that follow are executed.
for requirement in (
    "torch==2.4.1",
    "torchvision==0.19.1",
    "pixeltable==0.2.20",
    "git+https://github.com/Megvii-BaseDetection/YOLOX.git",
):
    install(requirement)
import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import FrameIterator
from pixeltable.ext.functions.yolox import yolox
import PIL.Image
import PIL.ImageDraw
# Pixeltable UDF that overlays detection boxes on a frame
@pxt.udf
def draw_boxes(img: PIL.Image.Image, boxes: list[list[float]]) -> PIL.Image.Image:
    """Return a copy of ``img`` with one rectangle outlined per entry of ``boxes``.

    The input image is left untouched; all drawing happens on the copy.
    Each box is handed as-is to ``ImageDraw.rectangle`` with a 3-pixel outline.
    """
    annotated = img.copy()
    canvas = PIL.ImageDraw.Draw(annotated)
    for corners in boxes:
        canvas.rectangle(corners, width=3)
    return annotated
# Gradio Application
def process_video(video_file, model_id, threshold, progress=gr.Progress()):
    """Run YOLOX object detection on an uploaded video using Pixeltable.

    Args:
        video_file: Value of the upload component. Either an object exposing
            the file path as ``.name`` or the path itself.
        model_id: YOLOX model identifier passed to ``yolox`` (also used to
            name the computed column, ``detect_<model_id>``).
        threshold: Detection threshold passed to ``yolox``.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A ``(video, frames)`` tuple: the first ``make_video`` result built from
        the frames with bounding boxes drawn, and a sample of original frames
        (at most 10, taken from frames whose ``pos`` is even).

    Raises:
        gr.Error: If no video was provided.
    """
    # Validate before touching Pixeltable so that a click without an upload
    # shows a readable message instead of failing on `None.name` after the
    # existing directory has already been dropped.
    if video_file is None:
        raise gr.Error("Please upload a video before processing.")
    # Accept both an object carrying the path in `.name` and a plain path string.
    video_path = getattr(video_file, "name", video_file)

    progress(0, desc="Initializing...")

    # Ensure a clean slate for the demo
    pxt.drop_dir('video_tutorial', force=True)
    pxt.create_dir('video_tutorial')

    # Create the `videos` table
    videos_table = pxt.create_table(
        'video_tutorial.videos',
        {'video': pxt.VideoType()}
    )

    # Create a view over the video frames, sampled at 5 frames per second
    frames_view = pxt.create_view(
        'video_tutorial.frames',
        videos_table,
        iterator=FrameIterator.create(video=videos_table.video, fps=5)
    )

    # Insert video into Pixeltable table
    videos_table.insert([{'video': video_path}])

    progress(0.3, desc="Running Model...")

    # Perform object detection; results are stored in a column named after the model
    detection_column = f'detect_{model_id}'
    frames_view[detection_column] = yolox(
        frames_view.frame, model_id=model_id, threshold=threshold
    )

    progress(0.6, desc="Object detection completed...")

    # Prepare frame gallery: up to 10 frames, keeping only even positions
    sample_frames = (
        frames_view.select(frames_view.frame)
        .where(frames_view.pos % 2 == 0)
        .limit(10)
        .collect()['frame']
    )

    progress(0.8, desc="Outputs generated, retrieving video...")

    # Generate output video with bounding boxes, grouping frames by source video
    annotated_video = frames_view.group_by(videos_table).select(
        pxt.functions.video.make_video(
            frames_view.pos,
            draw_boxes(
                frames_view.frame,
                frames_view[detection_column].bboxes
            )
        )
    ).collect()['col_0'][0]

    return annotated_video, sample_frames
# Gradio interface
# Layout, top to bottom: header, product blurb, disclaimer, description of the
# demo, inputs (upload + model settings), examples, run button, outputs.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    # Header. The wrapping <div> needs `style="..."` for its CSS to be applied.
    gr.Markdown(
        """
<div style="max-width: 800px; margin: 0 auto;">
<img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />
<h1 style="margin-bottom: 0.5em;">Object Detection in Videos</h1>
</div>
"""
    )
    gr.HTML(
        """
<p>
<a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
</p>
"""
    )

    # Add the disclaimer
    gr.HTML(
        """
<div style="background-color: #E5DDD4; border: 1px solid #e9ecef; color: #000000; border-radius: 8px; padding: 15px; margin-bottom: 20px;">
<strong style="color: #000000">Disclaimer:</strong> This app is best run on your own hardware with a GPU for optimal performance. This Hugging Face Space uses the free tier (2vCPU, 16GB RAM), which may result in slower processing times, especially for large video files. If you wish to use this app with your own hardware for improved performance, you can <a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox?duplicate=true" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">duplicate this Hugging Face Space</a>, run it locally, or use Google Colab with the Free limited GPU support.
</div>
"""
    )

    with gr.Row():
        with gr.Column():
            with gr.Accordion("What This Demo Does", open=True):
                gr.Markdown("""
1. **Ingests Videos**: Uploads your Video.
2. **Process and Retrieve Data**: Store, version, chunk, and retrieve video and frames.
3. **Detects Objects**: Leverages Pixeltable's YOLOX integration to produce object detection results.
4. **Visualizes Output**: Displays the processed video alongside a sample of the original frames.
""")

    # Input section
    with gr.Row():
        # Left column for video upload
        with gr.Column():
            video_file = gr.File(label="Upload Video", file_count="single")

        # Right column for model selection and threshold
        with gr.Column():
            model_id = gr.Radio(
                choices=['yolox_tiny', 'yolox_m', 'yolox_x'],
                value='yolox_tiny',
                label="YOLOX Model",
                interactive=True
            )
            threshold = gr.Slider(minimum=0.1, maximum=0.9, value=0.25, step=0.05, label="Threshold")

    # Example presets: each row fills (video, model, threshold)
    gr.Examples(
        examples=[
            ["bangkok.mp4", "yolox_tiny", 0.25],
            ["lotr.mp4", "yolox_m", 0.3],
            ["mi.mp4", "yolox_x", 0.5],
        ],
        inputs=[video_file, model_id, threshold],
        fn=process_video
    )

    # Button to trigger file processing
    process_button = gr.Button("Process Video")

    with gr.Row():
        # Left column for video output
        with gr.Column(scale=1):
            output_video = gr.Video(label="Processed Video with Detections")

        # Right column for frame gallery
        with gr.Column(scale=1):
            frame_gallery = gr.Gallery(label="Frame Gallery", show_label=True, elem_id="gallery")

    # Wire the button: the two return values of process_video fill the two outputs
    process_button.click(
        process_video,
        inputs=[video_file, model_id, threshold],
        outputs=[output_video, frame_gallery]
    )
if __name__ == "__main__":
    # Launch the app only when this file is executed as a script (not on import).
    demo.launch(debug=True)