import subprocess
import sys

def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

install("torch==2.4.1")
install("torchvision==0.19.1")
install("pixeltable==0.2.20")
install("git+https://github.com/Megvii-BaseDetection/YOLOX.git")

import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import FrameIterator
from pixeltable.ext.functions.yolox import yolox
import PIL.Image
import PIL.ImageDraw

# Define a UDF that draws bounding boxes on a frame
@pxt.udf
def draw_boxes(img: PIL.Image.Image, boxes: list[list[float]]) -> PIL.Image.Image:
    result = img.copy()  # Work on a copy so the original frame stays untouched
    d = PIL.ImageDraw.Draw(result)
    for box in boxes:
        # Draw bounding box rectangles on the copied image
        d.rectangle(box, width=3)
    return result
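
# Because `draw_boxes` is registered as a UDF, it can be referenced inside
# Pixeltable expressions below and is executed once per frame row.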

# Gradio callback: runs the full Pixeltable pipeline for one uploaded video
def process_video(video_file, model_id, threshold, progress=gr.Progress()):
    progress(0, desc="Initializing...")

    # Ensure a clean slate for the demo
    pxt.drop_dir('video_tutorial', force=True)
    pxt.create_dir('video_tutorial')

    # Create the `videos` table
    videos_table = pxt.create_table(
        'video_tutorial.videos',
        {'video': pxt.VideoType()}
    )

    # Create a view for video frames
    frames_view = pxt.create_view(
        'video_tutorial.frames',
        videos_table,
        iterator=FrameIterator.create(video=videos_table.video, fps=5)
    )
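    # The view materializes one row per extracted frame (at 5 fps) and is
    # populated automatically as videos are inserted into the base table.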
    
    # Insert video into Pixeltable table
    videos_table.insert([{'video': video_file.name}])

    progress(0.3, desc="Running Model...")

    # Perform object detection
    frames_view[f'detect_{model_id}'] = yolox(
        frames_view.frame, model_id=model_id, threshold=threshold
    )
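    # Assigning an expression creates a computed column: Pixeltable evaluates
    # YOLOX over all existing frames now and over any rows inserted later.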

    progress(0.6, desc="Object detection completed...")
    
    # Prepare frame gallery
    frame_gallery = (
        frames_view.select(frames_view.frame)
        .where(frames_view.pos % 2 == 0)
        .limit(10)
        .collect()['frame']
    )

    progress(0.8, desc="Outputs generated, retrieving video...")    

    # Generate output video with bounding boxes
    output_video = frames_view.group_by(videos_table).select(
        pxt.functions.video.make_video(
            frames_view.pos,
            draw_boxes(
                frames_view.frame,
                frames_view[f'detect_{model_id}'].bboxes
            )
        )
    ).collect()['col_0'][0]
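    # `make_video` aggregates the annotated frames (indexed by `pos`) back into
    # one clip per source video; the unnamed select expression comes back under
    # the default column name `col_0`.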

    return output_video, frame_gallery

# Gradio interface
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(
        """
        <div style="max-width: 800px; margin: 0 auto;">
            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />
            <h1 style="margin-bottom: 0.5em;">Object Detection in Videos</h1>
        </div>
        """
    )
    gr.HTML(
        """
        <p>
            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
        </p>
        """
    )

    # Add the disclaimer
    gr.HTML(
        """
        <div style="background-color: #E5DDD4; border: 1px solid #e9ecef; color: #000000; border-radius: 8px; padding: 15px; margin-bottom: 20px;">
            <strong style="color: #000000">Disclaimer:</strong> This app is best run on your own hardware with a GPU for optimal performance. This Hugging Face Space uses the free tier (2vCPU, 16GB RAM), which may result in slower processing times, especially for large video files. If you wish to use this app with your own hardware for improved performance, you can <a href="https://huggingface.co/spaces/Pixeltable/object-detection-in-videos-with-yolox?duplicate=true" target="_blank" style="color: #4D148C; text-decoration: none; font-weight: bold;">duplicate this Hugging Face Space</a>, run it locally, or use Google Colab with the Free limited GPU support.
        </div>
        """
    )

    with gr.Row():
        with gr.Column():
            with gr.Accordion("What This Demo Does", open=True):
                gr.Markdown("""
                    1. **Ingests Videos**: Uploads your video to Pixeltable.
                    2. **Processes and Retrieves Data**: Stores, versions, and retrieves the video and its extracted frames.
                    3. **Detects Objects**: Leverages Pixeltable's YOLOX integration to produce object detection results.
                    4. **Visualizes Output**: Displays the processed video alongside a sample of the original frames.
                    """)

    # Input section
    with gr.Row():
        # Left column for video upload
        with gr.Column():
            video_file = gr.File(label="Upload Video", file_count="single")

        # Right column for model selection and threshold
        with gr.Column():
            model_id = gr.Radio(
                choices=['yolox_tiny', 'yolox_m', 'yolox_x'],
                value='yolox_tiny',
                label="YOLOX Model",
                interactive=True
            )
            threshold = gr.Slider(minimum=0.1, maximum=0.9, value=0.25, step=0.05, label="Threshold")
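            # `threshold` is passed to YOLOX as the minimum detection
            # confidence; boxes scoring below it are discarded.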
    
    gr.Examples(
        examples=[
            ["bangkok.mp4", "yolox_tiny", 0.25],
            ["lotr.mp4", "yolox_m", 0.3],
            ["mi.mp4", "yolox_x", 0.5],
        ],
        inputs=[video_file, model_id, threshold],
        fn=process_video
    )
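    # The example videos (bangkok.mp4, lotr.mp4, mi.mp4) are expected to sit
    # alongside this script in the Space repository.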
 
    # Button to trigger file processing
    process_button = gr.Button("Process Video")
    
    with gr.Row():
        # Left column for video output
        with gr.Column(scale=1):
            output_video = gr.Video(label="Processed Video with Detections")
        
        # Right column for frame gallery
        with gr.Column(scale=1):
            frame_gallery = gr.Gallery(label="Frame Gallery", show_label=True, elem_id="gallery")

    process_button.click(
        process_video,
        inputs=[video_file, model_id, threshold],
        outputs=[output_video, frame_gallery]
    )

if __name__ == "__main__":
    demo.launch(debug=True)