# Hugging Face Spaces app (Space status when this listing was captured: Sleeping)
from enum import Enum | |
import numpy as np | |
import gradio as gr | |
import torch | |
from PIL import Image | |
from transformers import DPTImageProcessor, DPTForDepthEstimation | |
from typing import List, Tuple | |
import random | |
from PIL import ImageDraw, ImageFont | |
from gradio.components import Image as grImage | |
import mediapipe as mp | |
# Depth estimation: the DPT pre-processor and network are both loaded from the
# "Intel/dpt-large" checkpoint once, at import time, and shared by all requests.
processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

# Face detection: a single MediaPipe detector instance reused for every image.
detector = mp.solutions.face_detection.FaceDetection(
    min_detection_confidence=0.5,
    model_selection=1,
)
class Placement(Enum):
    """
    How a mask is anchored relative to the reference coordinate it is given
    """

    # The mask is centred on the reference coordinate.
    CENTER = 0
    # The mask is horizontally centred and sits directly above the coordinate.
    TOP = 1
class FaceKeypointsLabel(Enum):
    """
    Semantic label attached to a face keypoint
    """

    # Any keypoint that is not the nose.
    OTHER = 0
    # The nose keypoint, used to position nose masks and to sample depth.
    NOSE = 1
class Keypoints:
    """
    A single labelled face keypoint in normalized image coordinates
    """

    def __init__(self, x: float, y: float, label: FaceKeypointsLabel):
        """
        :param x: x coordinate of the keypoint, normalized between 0 and 1
        :param y: y coordinate of the keypoint, normalized between 0 and 1
        :param label: which facial feature this keypoint represents
        """
        self.x, self.y = x, y
        self.label = label
class BoundingBox:
    """
    An axis-aligned box described by its top-left corner and its size
    """

    def __init__(self, x_min: int, y_min: int, width: int, height: int):
        """
        :param x_min: x coordinate of the left edge
        :param y_min: y coordinate of the top edge
        :param width: horizontal extent of the box
        :param height: vertical extent of the box
        """
        self.x_min, self.y_min = x_min, y_min
        self.width, self.height = width, height
class FaceDetectionResult:
    """
    One detected face: its bounding box together with its labelled keypoints
    """

    def __init__(self, bounding_box : BoundingBox, keypoints: List[Keypoints]):
        """
        :param bounding_box: box surrounding the detected face
        :param keypoints: keypoints found on the face
        """
        self.keypoints = keypoints
        self.bounding_box = bounding_box
def detect_face(image: Image) -> list:
    """
    Use mediapipe to detect faces in an image

    :param image: PIL image to analyse
    :return: the raw mediapipe detections, or an empty list when no face is found
    """
    # MediaPipe's face detector works on 3-channel RGB arrays, so palette,
    # grayscale and RGBA inputs are normalised first. The caller's image object
    # is left untouched because convert() returns a new image.
    if image.mode != "RGB":
        image = image.convert("RGB")
    result = detector.process(np.array(image))
    # mediapipe reports "no faces" as None rather than as an empty sequence.
    if result.detections is None:
        return []
    return result.detections
def predict_depth(image: Image) -> np.ndarray:
    """
    Predict depth for an image

    :param image: PIL image to run the DPT model on
    :return: uint8 array with the same height and width as the image, scaled so
        that the largest predicted value maps to 255. NOTE(review): DPT is
        documented as predicting relative *inverse* depth, i.e. larger values
        should mean closer to the camera - confirm against the model card.
    """
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_depth = outputs.predicted_depth
    # Interpolate to original size (PIL reports (width, height), torch wants (height, width))
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=image.size[::-1],
        mode="bicubic",
        align_corners=False,
    )
    output = prediction.squeeze().cpu().numpy()
    peak = float(np.max(output))
    if peak <= 0:
        # Nothing to normalise against: avoid a division by zero (NaN cast to
        # uint8) and return a flat map instead.
        return np.zeros(output.shape, dtype="uint8")
    # Clip before casting so stray negative values cannot wrap around in uint8.
    return np.clip(output * 255 / peak, 0, 255).astype("uint8")
def estimate_depth_at_points(depth_map: np.ndarray, coordinates: List[Tuple[int, int]]) -> List[float]:
    """
    Get the depth at the given coordinates

    :param depth_map: 2D array indexed as depth_map[y, x]
    :param coordinates: (x, y) pixel positions to sample
    :return: one depth value per coordinate, in the same order
    """
    max_row = depth_map.shape[0] - 1
    max_col = depth_map.shape[1] - 1
    depth_estimates = []
    for x, y in coordinates:
        # Keypoints can land on or slightly outside the image border. Clamp
        # them so that an out-of-range point samples the nearest edge pixel
        # instead of raising IndexError or wrapping around via a negative index.
        col = min(max(int(x), 0), max_col)
        row = min(max(int(y), 0), max_row)
        depth_estimates.append(float(depth_map[row, col]))
    return depth_estimates
class Person:
    """
    Geometry of one person's head, in pixels, as needed to place masks
    """

    def __init__(self, nose_x: int, nose_y: int, head_width: int, head_height: int, middle_top_head_x: int, middle_top_head_y: int):
        """
        :param nose_x: x coordinate of the nose
        :param nose_y: y coordinate of the nose
        :param head_width: width of the head
        :param head_height: height of the head
        :param middle_top_head_x: x coordinate of the middle of the top of the head
        :param middle_top_head_y: y coordinate of the top of the head
        """
        self.nose_x, self.nose_y = nose_x, nose_y
        self.head_width, self.head_height = head_width, head_height
        self.middle_top_head_x, self.middle_top_head_y = middle_top_head_x, middle_top_head_y
        # The nose size is derived from the head size: a fifth of the width
        # and a third of the height, truncated to whole pixels.
        self.nose_width = int(head_width / 5)
        self.nose_height = int(head_height / 3)
def extract_persons(face_detection_results: List[FaceDetectionResult], image: Image) -> List[Person]:
    """
    Extract a list of people from a face detection result

    :param face_detection_results: parsed face detections
    :param image: the image the detections belong to; its width and height are
        used to turn the normalized nose keypoint into pixel coordinates
    :return: one Person per detection that has a nose keypoint
    """
    persons = []
    for face_result in face_detection_results:
        # Look the nose up by label rather than by position in the list.
        nose_keypoint = next(
            (keypoint for keypoint in face_result.keypoints if keypoint.label == FaceKeypointsLabel.NOSE),
            None,
        )
        if nose_keypoint is None:
            # Without a nose there is nothing to anchor the masks on. Skip the
            # face instead of failing or reusing the nose of a previous face.
            continue
        bbox = face_result.bounding_box
        persons.append(
            Person(
                nose_x=int(nose_keypoint.x * image.width),
                nose_y=int(nose_keypoint.y * image.height),
                head_width=bbox.width,
                head_height=bbox.height,
                # The top of the head is taken to be the middle of the box's upper edge.
                middle_top_head_x=int(bbox.x_min + bbox.width // 2),
                middle_top_head_y=bbox.y_min,
            )
        )
    return persons
def add_mask(image: Image, mask: Image, coordinate: Tuple[int, int], size: Tuple[int, int], placement: Placement) -> Image:
    """
    Add a mask (a static image) to an image

    :param image: image to draw on; it is modified in place
    :param mask: image to paste; its alpha channel is used as transparency
    :param coordinate: (x, y) reference point for the placement
    :param size: target (width, height) of the mask, or a 1-tuple (width,) to
        derive the height from the mask's own aspect ratio
    :param placement: how the mask is anchored to the reference point
    :return: the image that was passed in
    """
    # maintain aspect ratio
    if len(size) == 1:
        ratio = mask.height / mask.width
        size = (size[0], int(size[0] * ratio))
    if size[0] <= 0 or size[1] <= 0:
        # A very small face can yield an empty target size. There is nothing
        # to draw and resizing to zero pixels would fail.
        return image
    if placement == Placement.CENTER:
        coordinate = (coordinate[0] - size[0] // 2, coordinate[1] - size[1] // 2)
    elif placement == Placement.TOP:
        coordinate = (coordinate[0] - size[0] // 2, coordinate[1] - size[1])
    # The mask doubles as its own transparency mask, which paste() only accepts
    # for modes carrying alpha. Normalise e.g. palette PNGs to RGBA first.
    if mask.mode != "RGBA":
        mask = mask.convert("RGBA")
    mask = mask.resize(size)
    image.paste(mask, coordinate, mask)
    return image
def draw_attributes(image: Image, persons: List[Person]) -> Image:
    """
    Debug helper that draws the detected face attributes onto an image

    For every person this marks the nose, outlines the head box, writes the
    head dimensions and the nose position, and marks the top of the head.
    The image is modified in place and returned.
    """
    green, red, white = (0, 255, 0), (255, 0, 0), (255, 255, 255)
    canvas = ImageDraw.Draw(image)
    label_font = ImageFont.load_default()
    for person in persons:
        nose_x, nose_y = person.nose_x, person.nose_y
        top_x, top_y = person.middle_top_head_x, person.middle_top_head_y
        half_width = person.head_width // 2

        # Nose marker
        canvas.ellipse([(nose_x - 5, nose_y - 5), (nose_x + 5, nose_y + 5)], fill=green)
        # Head outline
        canvas.rectangle(
            [(top_x - half_width, top_y), (top_x + half_width, top_y + person.head_height)],
            outline=green,
        )
        # Head dimensions, written just above the head
        canvas.text(
            (top_x, top_y - 20),
            f"Width: {person.head_width}, Height: {person.head_height}",
            fill=white,
            font=label_font,
        )
        # Nose position, written just below the nose
        canvas.text((nose_x, nose_y + 10), f"({person.nose_x}, {person.nose_y})", fill=white, font=label_font)
        # Top-of-head marker
        canvas.ellipse([(top_x - 5, top_y - 5), (top_x + 5, top_y + 5)], fill=red)
    return image
def apply_reindeer_mask(image: Image, person: Person) -> Image:
    """
    Apply a reindeer mask to a person in an image

    A nose is centred on the person's nose and antlers are placed on top of the
    head. The image is modified in place and returned.

    :param image: image to draw on
    :param person: the person that receives the mask
    :return: the image with the reindeer nose and antlers pasted on
    """
    # Open both assets up front so a missing file fails before anything is
    # drawn, and use context managers so the file handles are always released.
    with Image.open("mask/reindeer_nose.png") as reindeer_nose, \
            Image.open("mask/reindeer_antlers.png") as reindeer_antlers:
        reindeer_nose_coordinate = (person.nose_x, person.nose_y)
        # The nose height is used for both dimensions, so the nose is pasted as a square.
        reindeer_nose_size = (person.nose_height, person.nose_height)
        image = add_mask(image, reindeer_nose, reindeer_nose_coordinate, reindeer_nose_size, Placement.CENTER)
        # 1-tuple: add_mask derives the height from the asset's aspect ratio.
        reindeer_antlers_size = (person.head_width, )
        reindeer_antlers_coordinate = (person.middle_top_head_x, person.middle_top_head_y)
        image = add_mask(image, reindeer_antlers, reindeer_antlers_coordinate, reindeer_antlers_size, Placement.TOP)
    return image
def apply_santa_hat_mask(image: Image, person: Person) -> Image:
    """
    Apply a santa hat mask to a person in an image

    The hat is as wide as the head and is placed on top of it. The image is
    modified in place and returned.

    :param image: image to draw on
    :param person: the person that receives the hat
    :return: the image with the santa hat pasted on
    """
    # The context manager guarantees the asset's file handle is released.
    with Image.open("mask/santa_hat.png") as santa_hat:
        # 1-tuple: add_mask derives the height from the asset's aspect ratio.
        santa_hat_size = (person.head_width, )
        santa_hat_coordinate = (person.middle_top_head_x, person.middle_top_head_y)
        image = add_mask(image, santa_hat, santa_hat_coordinate, santa_hat_size, Placement.TOP)
    return image
def add_text(image: Image, text: str, font_size: int = 30) -> Image:
    """
    Add text to an image

    The text is drawn in red, starting at the centre of the image. The image is
    modified in place and returned.

    :param image: image to draw on
    :param text: the text to write
    :param font_size: requested size of the text; honoured when the installed
        Pillow can scale its default font, otherwise the fixed-size default
        font is used
    :return: the image with the text drawn on
    """
    draw = ImageDraw.Draw(image)
    try:
        font = ImageFont.load_default(size=font_size)
    except (TypeError, ImportError, OSError):
        # Older Pillow releases do not accept `size`, and a build without
        # FreeType cannot scale the default font. Fall back to the plain default.
        font = ImageFont.load_default()
    text_x = image.width // 2
    text_y = image.height // 2
    draw.text((text_x, text_y), text, fill=(255, 0, 0), font=font)
    return image
def apply_random_mask(image: Image, person: Person) -> Image:
    """
    Decorate a person with one of the available masks, picked at random

    :param image: image to draw on
    :param person: the person that receives the mask
    :return: whatever the chosen mask function returns
    """
    available_masks = [apply_santa_hat_mask, apply_reindeer_mask]
    chosen_mask = random.choice(available_masks)
    return chosen_mask(image, person)
def process_image(image : Image):
    """
    The full pipeline that takes an image and returns an image with more christmas spirit :)

    :param image: PIL image to decorate, or None when nothing was uploaded
    :return: the decorated image (the input image is drawn on directly), or
        None when no image was given
    """
    if image is None:
        # Nothing was uploaded, so there is nothing to process.
        return None
    detections = detect_face(image)
    face_detection_results = parse_detection_result(detections, image)
    persons = extract_persons(face_detection_results, image)
    if not persons:
        return add_text(image, "No faces detected in the image")
    if len(persons) == 1:
        return apply_random_mask(image, persons[0])

    # Apply the rules of the assignment, closest person gets santa hat, furthest person gets reindeer mask
    # All other people get a random mask (either santa hat or reindeer mask) (as this was not specified in the assignment)
    # The depth model is only run here because the ranking is only needed
    # when there are several people.
    depth_result = predict_depth(image)
    depth_estimates = estimate_depth_at_points(depth_result, [(person.nose_x, person.nose_y) for person in persons])
    # DPT predicts relative *inverse* depth, so the LARGEST value belongs to
    # the person closest to the camera and the smallest to the furthest one.
    # Taking both ends of a stable sort also guarantees two different people
    # when the estimates tie, which is common with an 8-bit depth map.
    ranking = np.argsort(depth_estimates, kind="stable")
    furthest_camera_index = int(ranking[0])
    closest_camera_index = int(ranking[-1])
    # Furthest first, so that the closest person's mask ends up drawn on top.
    image = apply_reindeer_mask(image, persons[furthest_camera_index])
    image = apply_santa_hat_mask(image, persons[closest_camera_index])
    for i, person in enumerate(persons):
        if i != closest_camera_index and i != furthest_camera_index:
            image = apply_random_mask(image, person)
    return image
def parse_detection_to_face_detection_result(detection, image_width: int, image_height: int) -> FaceDetectionResult:
    """
    Convert one mediapipe detection into a FaceDetectionResult

    :param detection: a mediapipe detection exposing `location_data`
    :param image_width: width used to scale the relative bounding box
    :param image_height: height used to scale the relative bounding box
    :return: the bounding box scaled by the given dimensions plus the labelled
        keypoints, which keep their relative coordinates
    """
    # Bounding box: scale the relative values and truncate to whole numbers
    relative_box = detection.location_data.relative_bounding_box
    bounding_box = BoundingBox(
        int(relative_box.xmin * image_width),
        int(relative_box.ymin * image_height),
        int(relative_box.width * image_width),
        int(relative_box.height * image_height),
    )

    # Keypoints: the one at this position is labelled as the nose, all others as OTHER
    nose_position = 2
    keypoints = [
        Keypoints(
            point.x,
            point.y,
            FaceKeypointsLabel.NOSE if position == nose_position else FaceKeypointsLabel.OTHER,
        )
        for position, point in enumerate(detection.location_data.relative_keypoints)
    ]
    return FaceDetectionResult(bounding_box, keypoints)
def parse_detection_result(detection_result, image: Image) -> List[FaceDetectionResult]:
    """
    Convert every mediapipe detection into a FaceDetectionResult

    :param detection_result: iterable of mediapipe detections
    :param image: the image the detections belong to; its width and height are
        passed on for each detection
    :return: one FaceDetectionResult per detection, in the same order
    """
    return [
        parse_detection_to_face_detection_result(detection, image.width, image.height)
        for detection in detection_result
    ]
def main():
    """
    Build the gradio interface around process_image and launch it
    """
    # Remarks: the code is in one file for simplicity, but it would be better to split it up in multiple files
    # Both the input and the output are exchanged as PIL images.
    interface_options = dict(
        fn=process_image,
        inputs=grImage(type="pil"),
        outputs=grImage(type="pil"),
        title="Image Processor",
        description="Upload an image to detect faces and apply transformations.",
    )
    gr.Interface(**interface_options).launch()


# Only start the app when this file is run as a script.
if __name__ == "__main__":
    main()