Spaces:

Pixeltable
/

object-detection-in-videos-with-yolox

Running

App Files Files Community

object-detection-in-videos-with-yolox / app.py

PierreBrunelle

Update app.py

9253c20 verified about 1 month ago

raw

history blame contribute delete

6.18 kB

	import subprocess
	import sys

	def install(package):
	    """Install `package` with pip, using the interpreter running this script.

	    Args:
	        package: A pip requirement specifier (name, pinned version, or VCS URL).

	    Raises:
	        subprocess.CalledProcessError: If the pip process exits with a
	            non-zero status.
	    """
	    # `sys.executable -m pip` targets the same environment this script runs in.
	    pip_command = [sys.executable, "-m", "pip", "install", package]
	    subprocess.check_call(pip_command)

	# Runtime dependencies, installed in this order before the third-party
	# imports that follow.
	for _requirement in (
	    "torch==2.4.1",
	    "torchvision==0.19.1",
	    "pixeltable==0.2.20",
	    "git+https://github.com/Megvii-BaseDetection/YOLOX.git",
	):
	    install(_requirement)

	import gradio as gr
	import pixeltable as pxt
	from pixeltable.iterators import FrameIterator
	from pixeltable.ext.functions.yolox import yolox
	import PIL.Image
	import PIL.ImageDraw

	# Creating a UDF to draw bounding boxes
	@pxt.udf
	def draw_boxes(img: PIL.Image.Image, boxes: list[list[float]]) -> PIL.Image.Image:
	    """Return a copy of `img` with a rectangle outlined for every entry in `boxes`.

	    The input image is left untouched; all drawing happens on the copy.

	    Args:
	        img: Source image.
	        boxes: One coordinate list per box, handed unchanged to
	            `ImageDraw.rectangle` (presumably `[x0, y0, x1, y1]` -- TODO confirm
	            against the detector's output format).

	    Returns:
	        The annotated copy of `img`.
	    """
	    annotated = img.copy()
	    painter = PIL.ImageDraw.Draw(annotated)
	    for bbox in boxes:
	        # 3-pixel-wide outline per detection.
	        painter.rectangle(bbox, width=3)
	    return annotated

	# Gradio Application
	def process_video(video_file, model_id, threshold, progress=gr.Progress()):
	    """Run YOLOX detection on an uploaded video and build the demo outputs.

	    Recreates the `video_tutorial` Pixeltable directory from scratch, stores
	    the video, extracts frames at 5 fps into a view, adds a
	    `detect_<model_id>` computed column, and then assembles an annotated
	    video plus a small gallery of frames.

	    Args:
	        video_file: Uploaded file object from the UI; its `.name` attribute is
	            inserted into the `videos` table as the video path.
	        model_id: YOLOX model identifier; forwarded to `yolox` and used to
	            name the detection column.
	        threshold: Detection threshold forwarded to `yolox`.
	        progress: Gradio progress tracker used to report stage updates.

	    Returns:
	        A `(output_video, frame_gallery)` tuple: the first `make_video` result
	        for the grouped frames, and the `frame` values of up to 10 frames
	        whose `pos` is even.

	    Raises:
	        gr.Error: If no video was uploaded.
	    """
	    # Fail fast with a user-visible message instead of an AttributeError on
	    # `video_file.name`, and before any existing data is dropped.
	    if video_file is None:
	        raise gr.Error("Please upload a video before processing.")

	    progress(0, desc="Initializing...")

	    # Ensure a clean slate for the demo
	    pxt.drop_dir('video_tutorial', force=True)
	    pxt.create_dir('video_tutorial')

	    # Create the `videos` table
	    videos_table = pxt.create_table(
	        'video_tutorial.videos',
	        {'video': pxt.VideoType()},
	    )

	    # Create a view for video frames
	    frames_view = pxt.create_view(
	        'video_tutorial.frames',
	        videos_table,
	        iterator=FrameIterator.create(video=videos_table.video, fps=5),
	    )

	    # Insert video into Pixeltable table
	    videos_table.insert([{'video': video_file.name}])

	    progress(0.3, desc="Running Model...")

	    # Perform object detection; the column name is reused below when drawing.
	    detection_column = f'detect_{model_id}'
	    frames_view[detection_column] = yolox(
	        frames_view.frame, model_id=model_id, threshold=threshold
	    )

	    progress(0.6, desc="Object detection completed...")

	    # Prepare frame gallery: up to 10 frames at even positions.
	    frame_gallery = (
	        frames_view.select(frames_view.frame)
	        .where(frames_view.pos % 2 == 0)
	        .limit(10)
	        .collect()['frame']
	    )

	    progress(0.8, desc="Outputs generated, retrieving video...")

	    # Generate output video with bounding boxes
	    annotated_frame = draw_boxes(
	        frames_view.frame,
	        frames_view[detection_column].bboxes,
	    )
	    output_video = frames_view.group_by(videos_table).select(
	        pxt.functions.video.make_video(frames_view.pos, annotated_frame)
	    ).collect()['col_0'][0]

	    return output_video, frame_gallery

	# Gradio interface
	# Builds the Gradio UI: header and disclaimer, input controls, example rows,
	# and output widgets, then wires the button to `process_video`.
	# NOTE(review): the original indentation was lost; the layout nesting below is
	# reconstructed from the `with` statements and comments -- confirm.
	with gr.Blocks(theme=gr.themes.Base()) as demo:
	    # Header. The wrapper <div> previously lacked `style="`, so its
	    # max-width/margin declarations were malformed markup.
	    gr.Markdown(
	        """
	<div style="max-width: 800px; margin: 0 auto;">
	<img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />
	<h1 style="margin-bottom: 0.5em;">Object Detection in Videos</h1>
	</div>
	"""
	    )
	    gr.HTML(
	        """
	<p>
	<a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
	</p>
	"""
	    )

	    # Add the disclaimer
	    gr.HTML(
	        """
	<div style="background-color: #E5DDD4; border: 1px solid #e9ecef; color: #000000; border-radius: 8px; padding: 15px; margin-bottom: 20px;">
	<strong style="color: #000000">Disclaimer:</strong> This app is best run on your own hardware with a GPU for optimal performance. This Hugging Face Space uses the free tier (2vCPU, 16GB RAM), which may result in slower processing times, especially for large video files. If you wish to use this app with your own hardware for improved performance, you can <a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox?duplicate=true" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">duplicate this Hugging Face Space</a>, run it locally, or use Google Colab with the Free limited GPU support.
	</div>
	"""
	    )

	    # Overview of what the demo does
	    with gr.Row():
	        with gr.Column():
	            with gr.Accordion("What This Demo Does", open=True):
	                gr.Markdown("""
	1. Ingests Videos: Uploads your Video.
	2. Process and Retrieve Data: Store, version, chunk, and retrieve video and frames.
	3. Detects Objects: Leverages Pixeltable's YOLOX integration to produce object detection results.
	4. Visualizes Output: Displays the processed video alongside a sample of the original frames.
	""")

	    # Input section
	    with gr.Row():
	        # Left column for video upload
	        with gr.Column():
	            video_file = gr.File(label="Upload Video", file_count="single")

	        # Right column for model selection and threshold
	        with gr.Column():
	            model_id = gr.Radio(
	                choices=['yolox_tiny', 'yolox_m', 'yolox_x'],
	                value='yolox_tiny',
	                label="YOLOX Model",
	                interactive=True,
	            )
	            threshold = gr.Slider(minimum=0.1, maximum=0.9, value=0.25, step=0.05, label="Threshold")

	    # Example rows that populate the three inputs above.
	    # NOTE(review): placed at the Blocks level; the original nesting of this
	    # call is not recoverable from the source -- confirm.
	    gr.Examples(
	        examples=[
	            ["bangkok.mp4", "yolox_tiny", 0.25],
	            ["lotr.mp4", "yolox_m", 0.3],
	            ["mi.mp4", "yolox_x", 0.5],
	        ],
	        inputs=[video_file, model_id, threshold],
	        fn=process_video,
	    )

	    # Button to trigger file processing
	    process_button = gr.Button("Process Video")

	    with gr.Row():
	        # Left column for video output
	        with gr.Column(scale=1):
	            output_video = gr.Video(label="Processed Video with Detections")

	        # Right column for frame gallery
	        with gr.Column(scale=1):
	            frame_gallery = gr.Gallery(label="Frame Gallery", show_label=True, elem_id="gallery")

	    # Run the pipeline on click and route its two results to the output widgets.
	    process_button.click(
	        process_video,
	        inputs=[video_file, model_id, threshold],
	        outputs=[output_video, frame_gallery],
	    )

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True)