""" Gradio app for pollen-vision This script creates a Gradio app for pollen-vision. The app allows users to perform object detection and object segmentation using the OWL-ViT and MobileSAM models. """ from datasets import load_dataset import gradio as gr import numpy as np import numpy.typing as npt from typing import Any, Dict, List from pollen_vision.vision_models.object_detection import OwlVitWrapper from pollen_vision.vision_models.object_segmentation import MobileSamWrapper from pollen_vision.vision_models.utils import Annotator, get_bboxes owl_vit = OwlVitWrapper() mobile_sam = MobileSamWrapper() annotator = Annotator() def object_detection( img: npt.NDArray[np.uint8], text_queries: List[str], score_threshold: float ) -> List[Dict[str, Any]]: predictions: List[Dict[str, Any]] = owl_vit.infer( im=img, candidate_labels=text_queries, detection_threshold=score_threshold ) return predictions def object_segmentation( img: npt.NDArray[np.uint8], object_detection_predictions: List[Dict[str, Any]] ) -> List[npt.NDArray[np.uint8]]: bboxes = get_bboxes(predictions=object_detection_predictions) masks: List[npt.NDArray[np.uint8]] = mobile_sam.infer(im=img, bboxes=bboxes) return masks def query( task: str, img: npt.NDArray[np.uint8], text_queries: List[str], score_threshold: float, ) -> npt.NDArray[np.uint8]: object_detection_predictions = object_detection( img=img, text_queries=text_queries, score_threshold=score_threshold ) if task == "Object detection + segmentation (OWL-ViT + MobileSAM)": masks = object_segmentation( img=img, object_detection_predictions=object_detection_predictions ) img = annotator.annotate( im=img, detection_predictions=object_detection_predictions, masks=masks ) return img img = annotator.annotate(im=img, detection_predictions=object_detection_predictions) return img description = """ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam nec purus et nunc tincidunt tincidunt. """ demo_inputs = [ gr.Dropdown( [ "Object detection (OWL-ViT)", "Object detection + segmentation (OWL-ViT + MobileSAM)", ], label="Choose a task", value="Object detection (OWL-ViT)", ), gr.Image(), "text", gr.Slider(0, 1, value=0.1), ] rdt_dataset = load_dataset("pollen-robotics/reachy-doing-things", split="train") img_kitchen_detection = rdt_dataset[11]["image"] img_kitchen_segmentation = rdt_dataset[12]["image"] demo_examples = [ [ "Object detection (OWL-ViT)", img_kitchen_detection, ["kettle", "black mug", "sink", "blue mug", "sponge", "bag of chips"], 0.15, ], [ "Object detection + segmentation (OWL-ViT + MobileSAM)", img_kitchen_segmentation, ["blue mug", "paper cup", "kettle", "sponge"], 0.12, ], ] demo = gr.Interface( fn=query, inputs=demo_inputs, outputs="image", title="pollen-vision", description=description, examples=demo_examples, ) demo.launch()