"""Gradio demo: DINOv2 depth estimation and image classification, plus
Mask2Former semantic and panoptic segmentation."""

import os

# Runtime installs (Hugging Face Spaces-style hack); a requirements.txt is the
# cleaner alternative for a real deployment. Installs run before the imports
# below that depend on them.
os.system("pip install git+https://github.com/huggingface/transformers.git")
os.system("pip install datasets scipy torch")

from collections import defaultdict

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    DPTForDepthEstimation,
    Mask2FormerForUniversalSegmentation,
)

# RGB palette used to color segmentation maps: one triplet per label/segment id
# (black first, then the ADE20K color set).
palette = np.asarray([
    [0, 0, 0], [120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
    [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230],
    [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], [120, 120, 70],
    [8, 255, 51], [255, 6, 82], [143, 255, 140], [204, 255, 4], [255, 51, 7],
    [204, 70, 3], [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
    [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92],
    [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], [10, 255, 71],
    [255, 41, 10], [7, 255, 255], [224, 255, 8], [102, 8, 255], [255, 61, 6],
    [255, 194, 7], [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
    [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140],
    [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], [255, 224, 0],
    [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255],
    [31, 0, 255], [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
    [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], [255, 102, 0],
    [194, 255, 0], [0, 143, 255], [51, 255, 0], [0, 82, 255], [0, 255, 41],
    [0, 255, 173], [10, 0, 255], [173, 255, 0], [0, 255, 153], [255, 92, 0],
    [255, 0, 255], [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
    [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], [255, 0, 204],
    [0, 255, 194], [0, 255, 82], [0, 10, 255], [0, 112, 255], [51, 0, 255],
    [0, 194, 255], [0, 122, 255], [0, 255, 163], [255, 153, 0], [0, 255, 10],
    [255, 112, 0], [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
    [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], [255, 0, 31],
    [0, 184, 255], [0, 214, 255], [255, 0, 112], [92, 255, 0], [0, 224, 255],
    [112, 224, 255], [70, 184, 160], [163, 0, 255], [153, 0, 255], [71, 255, 0],
    [255, 0, 163], [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
    [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], [10, 190, 212],
    [214, 255, 0], [0, 204, 255], [20, 0, 255], [255, 255, 0], [0, 153, 255],
    [0, 41, 255], [0, 255, 204], [41, 0, 255], [41, 255, 0], [173, 0, 255],
    [0, 245, 255], [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
    [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], [102, 255, 0],
    [92, 0, 255],
])

# DPT head on a DINOv2 backbone for monocular depth estimation.
depth_image_processor = AutoImageProcessor.from_pretrained("facebook/dpt-dinov2-small-nyu")
depth_model = DPTForDepthEstimation.from_pretrained("facebook/dpt-dinov2-small-nyu")

def compute_depth(img):
    inputs = depth_image_processor(images=img, return_tensors="pt")

    with torch.no_grad():
        outputs = depth_model(**inputs)
        predicted_depth = outputs.predicted_depth

    # Upsample the prediction back to the input resolution (PIL size is (w, h)).
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=img.size[::-1],
        mode="bicubic",
        align_corners=False,
    )

    # Normalize to 0-255 for display as a grayscale image.
    output = prediction.squeeze().cpu().numpy()
    formatted = (output * 255 / np.max(output)).astype("uint8")
    depth = Image.fromarray(formatted)
    return [depth, "depth"]

# DINOv2 with a single linear classification head trained on ImageNet-1k.
clas_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small-imagenet1k-1-layer")
clas_model = AutoModelForImageClassification.from_pretrained("facebook/dinov2-small-imagenet1k-1-layer")

def compute_clas(img):
    inputs = clas_processor(images=img, return_tensors="pt")
    with torch.no_grad():
        outputs = clas_model(**inputs)
    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()
    return [img, clas_model.config.id2label[predicted_class_idx]]

# Mask2Former trained on COCO panoptic; its outputs also support the semantic
# post-processing used below.
m2f_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-base-coco-panoptic")
m2f_model = Mask2FormerForUniversalSegmentation.from_pretrained("facebook/mask2former-swin-base-coco-panoptic")

def seg2sem(seg):
    """Color a semantic segmentation map and collect the class names present."""
    seg = seg.numpy() if isinstance(seg, torch.Tensor) else seg
    color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)

    handles = []
    for label, color in enumerate(palette):
        mask = seg == label
        color_seg[mask, :] = color
        if mask.any():
            # Record class name and its RGB color so the textbox output is readable.
            handles.append(m2f_model.config.id2label[label])
            handles.append(tuple(color))

    image = Image.fromarray(color_seg)
    return [image, handles]

def seg2pano(seg, segments_info):
    """Color a panoptic segmentation map (one palette entry per segment id) and
    build per-instance labels such as "car-0", "car-1"."""
    seg = seg.numpy() if isinstance(seg, torch.Tensor) else seg
    color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
    for segment_id, color in enumerate(palette):
        color_seg[seg == segment_id, :] = color
    image = Image.fromarray(color_seg)

    # Count repeated classes so each instance gets a distinct label.
    instances_counter = defaultdict(int)
    handles = []
    for segment in segments_info:
        segment_id = segment["id"]
        segment_label_id = segment["label_id"]
        segment_label = m2f_model.config.id2label[segment_label_id]
        label = f"{segment_label}-{instances_counter[segment_label_id]}"
        instances_counter[segment_label_id] += 1
        handles.append(label)
        handles.append(tuple(palette[segment_id]))
    return [image, handles]

def compute_m2f_sem_seg(img):
    inputs = m2f_processor(images=img, return_tensors="pt")

    with torch.no_grad():
        outputs = m2f_model(**inputs)

    # Resize the predicted map back to the input resolution (PIL size is (w, h)).
    seg = m2f_processor.post_process_semantic_segmentation(
        outputs, target_sizes=[img.size[::-1]]
    )[0]
    return seg2sem(seg)

def compute_m2f_pano_seg(img):
    inputs = m2f_processor(images=img, return_tensors="pt")

    with torch.no_grad():
        outputs = m2f_model(**inputs)

    seg = m2f_processor.post_process_panoptic_segmentation(
        outputs, target_sizes=[img.size[::-1]]
    )[0]
    return seg2pano(seg["segmentation"], seg["segments_info"])

labels = [
    "Dinov2 - Depth",
    "Dinov2 - Classification",
    "M2F - Semantic Segmentation",
    "M2F - Panoptic Segmentation",
]

def detect(img, application):
    if application == labels[0]:
        return compute_depth(img)
    elif application == labels[1]:
        return compute_clas(img)
    elif application == labels[2]:
        return compute_m2f_sem_seg(img)
    elif application == labels[3]:
        return compute_m2f_pano_seg(img)
    # Fallback: echo the input so both Gradio outputs are always populated.
    return [img, ""]

iface = gr.Interface(
    fn=detect,
    inputs=[gr.Image(type="pil"), gr.Radio(labels, label="Application")],
    outputs=[gr.Image(type="pil"), gr.Textbox()],
)
iface.launch(debug=True)