import spaces
import gradio as gr
import cv2
import tempfile
from PIL import Image, ImageDraw, ImageFont
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
import torch
import requests
import time

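# Load RT-DETR and its image processor; keep the model in fp16 on the GPU and
# compile it with torch.compile to reduce per-frame latency.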
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd", torch_dtype=torch.float16).to("cuda")
model = torch.compile(model, mode="reduce-overhead")

# Warm-up pass: the first forward call triggers torch.compile, so run it once at startup
url = 'http://images.cocodataset.org/val2017/000000039769.jpg' 
image = Image.open(requests.get(url, stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt").to("cuda", torch.float16)
with torch.no_grad():
    outputs = model(**inputs)

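# Draw predicted boxes and class labels onto the PIL frame for every detection
# above the confidence threshold.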
def draw_bounding_boxes(image, results, model, threshold=0.3):
    draw = ImageDraw.Draw(image)
    for result in results:
        for score, label_id, box in zip(
            result["scores"], result["labels"], result["boxes"]
        ):
            if score > threshold:
                label = model.config.id2label[label_id.item()]
                box = [round(i) for i in box.tolist()]
                draw.rectangle(box, outline="red", width=3)
                draw.text((box[0], box[1]), f"{label}: {score:.2f}", fill="red")
    return image


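# Streaming callback: @spaces.GPU requests a GPU (ZeroGPU) for the call; each webcam
# frame is preprocessed, run through RT-DETR, and returned with boxes drawn.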
@spaces.GPU
def inference(image, conf_threshold):
    # Preprocess the frame and match the model's device/dtype (fp16 on CUDA)
    inputs = image_processor(images=image, return_tensors="pt").to("cuda", torch.float16)

    start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)

    results = image_processor.post_process_object_detection(
        outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=conf_threshold
    )
    end = time.time()
    print("inference + post-process time (s):", end - start)

    bbs = draw_bounding_boxes(image, results, model, threshold=conf_threshold)
    print("draw time (s):", time.time() - end)
    return bbs


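# Minimal CSS to keep the webcam/result view centered and capped at 600x600 px.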
css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""

with gr.Blocks(css=css) as app:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Near Real-Time Webcam Stream with RT-DETR
    </h1>
    """
    )
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://github.com/lyuwenyu/RT-DETR' target='_blank'>github</a>
        </h3>
        """
    )
    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
            image = gr.Image(
                type="pil",
                label="Image",
                sources=["webcam"],
            )
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.85,
            )
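            # Stream webcam frames to `inference` every 0.1 s; the stream stops
            # after the 30 s time limit.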
            image.stream(
                fn=inference,
                inputs=[image, conf_threshold],
                outputs=[image],
                stream_every=0.1,
                time_limit=30,
            )
if __name__ == "__main__":
    app.launch()