from typing import List, Optional, Tuple, Union

import cv2
import numpy as np

from supervision.detection.core import Detections
from supervision.draw.color import Color, ColorPalette


class BoxAnnotator:
    """
    A class for drawing bounding boxes on an image using detections provided.

    Attributes:
        color (Union[Color, ColorPalette]): The color to draw the bounding box,
            can be a single color or a color palette
        thickness (int): The thickness of the bounding box lines, default is 3
        text_color (Color): Stored on the instance, default is black. Note that
            `annotate` does not read it: the label text is drawn in black or
            white depending on the luminance of the box color.
        text_scale (float): The scale of the text on the bounding box,
            default is 0.5
        text_thickness (int): The thickness of the text on the bounding box,
            default is 2
        text_padding (int): The padding around the text on the bounding box,
            default is 10
        avoid_overlap (bool): When `True` (the default), the label position is
            chosen by `get_optimal_label_pos` so that it avoids detection boxes
            and the image border where possible. When `False`, the label is
            always placed above the top-left corner of the box.
    """

    def __init__(
        self,
        color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
        thickness: int = 3,  # 1 for seeclick, 2 for mind2web and 3 for demo
        text_color: Color = Color.BLACK,
        text_scale: float = 0.5,  # 0.8 for mobile/web, 0.3 for desktop, 0.4 for mind2web
        text_thickness: int = 2,  # 2 for demo
        text_padding: int = 10,
        avoid_overlap: bool = True,
    ):
        self.color: Union[Color, ColorPalette] = color
        self.thickness: int = thickness
        self.text_color: Color = text_color
        self.text_scale: float = text_scale
        self.text_thickness: int = text_thickness
        self.text_padding: int = text_padding
        self.avoid_overlap: bool = avoid_overlap

    def annotate(
        self,
        scene: np.ndarray,
        detections: Detections,
        labels: Optional[List[str]] = None,
        skip_label: bool = False,
        image_size: Optional[Tuple[int, int]] = None,
    ) -> np.ndarray:
        """
        Draws bounding boxes on the frame using the detections provided.

        The boxes (and labels) are drawn directly onto `scene`, which is also
        returned.

        Args:
            scene (np.ndarray): The image on which the bounding boxes will be drawn
            detections (Detections): The detections for which the bounding boxes
                will be drawn
            labels (Optional[List[str]]): An optional list of labels corresponding
                to each detection. If `labels` are not provided, or their number
                does not match the number of detections, the corresponding
                `class_id` will be used as label.
            skip_label (bool): If set to `True`, skips bounding box label annotation.
            image_size (Optional[Tuple[int, int]]): `(width, height)` used to keep
                labels inside the image when `avoid_overlap` is enabled. When
                omitted, it is derived from the shape of `scene`.

        Returns:
            np.ndarray: The image with the bounding boxes drawn on it

        Example:
            ```python
            import supervision as sv

            classes = ['person', ...]
            image = ...
            detections = sv.Detections(...)

            box_annotator = sv.BoxAnnotator()
            labels = [
                f"{classes[class_id]} {confidence:0.2f}"
                for _, _, confidence, class_id, _ in detections
            ]
            annotated_frame = box_annotator.annotate(
                scene=image.copy(),
                detections=detections,
                labels=labels
            )
            ```
        """
        font = cv2.FONT_HERSHEY_SIMPLEX

        # The overlap-avoiding placement needs the image bounds. Fall back to
        # the scene itself so the default call (image_size=None) does not fail.
        if image_size is None and self.avoid_overlap and not skip_label:
            scene_height, scene_width = scene.shape[:2]
            image_size = (scene_width, scene_height)

        # Custom labels are only used when there is exactly one per detection.
        use_class_id_label = labels is None or len(detections) != len(labels)

        for i in range(len(detections)):
            x1, y1, x2, y2 = detections.xyxy[i].astype(int)
            class_id = (
                detections.class_id[i] if detections.class_id is not None else None
            )
            idx = class_id if class_id is not None else i
            color = (
                self.color.by_idx(idx)
                if isinstance(self.color, ColorPalette)
                else self.color
            )
            cv2.rectangle(
                img=scene,
                pt1=(x1, y1),
                pt2=(x2, y2),
                color=color.as_bgr(),
                thickness=self.thickness,
            )
            if skip_label:
                continue

            text = f"{class_id}" if use_class_id_label else labels[i]

            text_width, text_height = cv2.getTextSize(
                text=text,
                fontFace=font,
                fontScale=self.text_scale,
                thickness=self.text_thickness,
            )[0]

            if not self.avoid_overlap:
                # Fixed placement: label sits on top of the box, left-aligned.
                text_x = x1 + self.text_padding
                text_y = y1 - self.text_padding

                text_background_x1 = x1
                text_background_y1 = y1 - 2 * self.text_padding - text_height

                text_background_x2 = x1 + 2 * self.text_padding + text_width
                text_background_y2 = y1
            else:
                (
                    text_x,
                    text_y,
                    text_background_x1,
                    text_background_y1,
                    text_background_x2,
                    text_background_y2,
                ) = get_optimal_label_pos(
                    self.text_padding,
                    text_width,
                    text_height,
                    x1,
                    y1,
                    x2,
                    y2,
                    detections,
                    image_size,
                )

            cv2.rectangle(
                img=scene,
                pt1=(text_background_x1, text_background_y1),
                pt2=(text_background_x2, text_background_y2),
                color=color.as_bgr(),
                thickness=cv2.FILLED,
            )

            # Pick black or white text depending on how bright the label
            # background is. Both values are channel-order independent.
            box_color = color.as_rgb()
            luminance = (
                0.299 * box_color[0] + 0.587 * box_color[1] + 0.114 * box_color[2]
            )
            text_color = (0, 0, 0) if luminance > 160 else (255, 255, 255)
            cv2.putText(
                img=scene,
                text=text,
                org=(text_x, text_y),
                fontFace=font,
                fontScale=self.text_scale,
                color=text_color,
                thickness=self.text_thickness,
                lineType=cv2.LINE_AA,
            )
        return scene


def box_area(box):
    """Return the area of a box given as `(x1, y1, x2, y2)`."""
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_area(box1, box2):
    """Return the overlap area of two `(x1, y1, x2, y2)` boxes, 0 if disjoint."""
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    return max(0, x2 - x1) * max(0, y2 - y1)


def IoU(box1, box2, return_max=True):
    """
    Return an overlap score for two `(x1, y1, x2, y2)` boxes.

    Args:
        box1: First box.
        box2: Second box.
        return_max (bool): When `True` (the default), return the largest of the
            intersection-over-union and the fractions of each box covered by
            the intersection, so a small box lying inside a large one still
            scores high. When `False`, return plain intersection-over-union.

    Returns:
        The overlap score; 0 when the union of the boxes has no area.
    """
    intersection = intersection_area(box1, box2)
    area1 = box_area(box1)
    area2 = box_area(box2)
    union = area1 + area2 - intersection

    # Two degenerate (zero-area) boxes have an empty union; report no overlap
    # instead of dividing by zero.
    iou = intersection / union if union > 0 else 0.0
    if not return_max:
        return iou

    if area1 > 0 and area2 > 0:
        ratio1 = intersection / area1
        ratio2 = intersection / area2
    else:
        ratio1, ratio2 = 0, 0
    return max(iou, ratio1, ratio2)


def get_optimal_label_pos(
    text_padding, text_width, text_height, x1, y1, x2, y2, detections, image_size
):
    """
    Choose where to draw the label of the box `(x1, y1, x2, y2)`.

    Candidate positions are tried in this order: top left, outer left,
    outer right, top right. A candidate is rejected when its background
    rectangle scores above 0.3 with `IoU` against any box in `detections`, or
    when it extends outside the image. The first accepted candidate is
    returned; if every candidate is rejected, the last one tried (top right)
    is returned.

    Args:
        text_padding: Padding around the text inside the label background.
        text_width: Width of the rendered text.
        text_height: Height of the rendered text.
        x1, y1, x2, y2: Corners of the box being labelled.
        detections: Object exposing `xyxy` boxes and `len()`; the label should
            avoid covering these boxes.
        image_size: `(width, height)` of the image, or `None` to skip the
            image-bounds check.

    Returns:
        A tuple `(text_x, text_y, text_background_x1, text_background_y1,
        text_background_x2, text_background_y2)`.
    """
    overlap_threshold = 0.3

    def get_is_overlap(bg_x1, bg_y1, bg_x2, bg_y2):
        # Rejected if the label background covers a detection box too much...
        label_box = [bg_x1, bg_y1, bg_x2, bg_y2]
        if any(
            IoU(label_box, detections.xyxy[i].astype(int)) > overlap_threshold
            for i in range(len(detections))
        ):
            return True
        # ...or if it sticks out of the image (when the bounds are known).
        if image_size is None:
            return False
        return (
            bg_x1 < 0
            or bg_x2 > image_size[0]
            or bg_y1 < 0
            or bg_y2 > image_size[1]
        )

    label_width = text_width + 2 * text_padding
    label_height = text_height + 2 * text_padding

    # Each candidate: (text_x, text_y, bg_x1, bg_y1, bg_x2, bg_y2).
    candidates = (
        # top left: above the box, aligned with its left edge
        (
            x1 + text_padding,
            y1 - text_padding,
            x1,
            y1 - label_height,
            x1 + label_width,
            y1,
        ),
        # outer left: to the left of the box, hanging down from its top edge
        (
            x1 - text_padding - text_width,
            y1 + text_padding + text_height,
            x1 - label_width,
            y1,
            x1,
            y1 + label_height,
        ),
        # outer right: to the right of the box, hanging down from its top edge
        (
            x2 + text_padding,
            y1 + text_padding + text_height,
            x2,
            y1,
            x2 + label_width,
            y1 + label_height,
        ),
        # top right: above the box, aligned with its right edge
        (
            x2 - text_padding - text_width,
            y1 - text_padding,
            x2 - label_width,
            y1 - label_height,
            x2,
            y1,
        ),
    )

    for candidate in candidates:
        if not get_is_overlap(*candidate[2:]):
            return candidate
    # Every position is rejected: fall back to the last one tried.
    return candidates[-1]