# Credits to IDEA Research for the model:
# https://huggingface.co/IDEA-Research/grounding-dino-tiny
from base64 import b64decode
from io import BytesIO

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Loaded once at startup; `predict` reuses these across requests.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)


# Fix: `spaces` was imported but never used — without this decorator the
# function is never allocated a GPU slice on a ZeroGPU Space.
@spaces.GPU
def predict(base64: str, queries: str, box_threshold: float, text_threshold: float) -> dict:
    """Run zero-shot object detection on a base64-encoded image.

    Args:
        base64: Base64-encoded image bytes (no data-URL prefix).
        queries: Text prompt; lowercase phrases separated by full stops,
            e.g. "a bird. a blue bird.".
        box_threshold: Minimum box confidence to keep a detection.
        text_threshold: Minimum text-match confidence to keep a detection.

    Returns:
        A JSON-serializable dict with keys "scores" (list[float]),
        "labels" (list[str]) and "boxes" (list of [x0, y0, x1, y1] floats
        in pixel coordinates).
    """
    image_bytes = b64decode(base64)
    # Fix: normalize to RGB — RGBA/palette/grayscale uploads would otherwise
    # reach the image processor with an unexpected channel count.
    image = Image.open(BytesIO(image_bytes)).convert("RGB")

    inputs = processor(images=image, text=queries, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL size is (width, height); the post-processor wants (height, width).
    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        target_sizes=[image.size[::-1]],
    )

    # Convert tensors to plain Python types so Gradio can serialize the dict.
    fmt_results = {
        "scores": [float(s) for s in results[0]["scores"]],
        "labels": results[0]["labels"],
        "boxes": [[float(x) for x in box] for box in results[0]["boxes"]],
    }
    print(fmt_results)
    return fmt_results


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Image (B64)"),
        gr.Text(
            label="Queries, in lowercase, separated by full stop",
            placeholder="a bird. a blue bird.",
        ),
        gr.Number(label="box_threshold", value=0.4),
        gr.Number(label="text_threshold", value=0.3),
    ],
    outputs=gr.JSON(label="Predictions"),
)

demo.launch()