from typing import Tuple, Optional

import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image
from gradio_image_prompter import ImagePrompter

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS, \
    pre_process_region_task_input, post_process_region_output
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
    DENSE_REGION_CAPTION_TASK_NAME

MARKDOWN = """
# Better Florence-2 Playground 🔥
Florence-2 is a lightweight vision-language model open-sourced by Microsoft under
the MIT license. The model demonstrates strong zero-shot and fine-tuning
capabilities across tasks such as captioning, object detection, grounding, and
segmentation. It takes an image and a task prompt as input and generates the
result as text. A DaViT vision encoder converts the image into visual token
embeddings, which are concatenated with BERT-generated text embeddings and
processed by a transformer-based multi-modal encoder-decoder to produce the
response.
"""

EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
]

# Inference runs on the GPU allocated by ZeroGPU when the @spaces.GPU-decorated
# function is called, so the device is fixed to CUDA rather than auto-detected.
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cuda"

MODELS, PROCESSORS = load_models(DEVICE)


@spaces.GPU
def process(
    checkpoint_dropdown,
    task_dropdown,
    image_input,
    image_prompter_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]

    # Tasks that return detections to be drawn onto the input image.
    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None

    # Tasks that return plain text (captions, OCR).
    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]

    # Tasks prompted with user-drawn boxes; run inference once per prompt and
    # merge the resulting detections.
    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
        detections_list = []
        image_input = image_prompter_input["image"]
        for prompt in image_prompter_input["points"]:
            text = pre_process_region_task_input(
                prompt=prompt,
                resolution_wh=image_input.size
            )
            _, response = run_inference(
                model, processor, DEVICE, image_input, task, text)
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
            detections_list.append(detections)
        detections = sv.Detections.merge(detections_list=detections_list)
        detections = post_process_region_output(
            detections=detections, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None

    return None, None


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model",
            info="Select a Florence-2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task",
            info="Select a task to perform with the model.",
            interactive=True
        )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Image prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )

    def on_dropdown_change(text):
        return [
            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
        ]

    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )

demo.launch(debug=False, show_error=True, max_threads=1)
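
# For reference: a minimal sketch of what a direct Florence-2 call looks like
# with the standard Hugging Face `transformers` usage (trust_remote_code plus
# the processor's post_process_generation). This is an assumption about what
# utils.models.run_inference likely does internally, not the actual helper
# code; it is defined here purely for illustration and is never invoked by the
# app.
def _reference_florence2_inference(image: Image.Image, task: str = "<OD>") -> dict:
    from transformers import AutoModelForCausalLM, AutoProcessor

    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large-ft", trust_remote_code=True).to(DEVICE)
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large-ft", trust_remote_code=True)

    # Florence-2 is prompted with a task token such as "<OD>" or "<CAPTION>".
    inputs = processor(text=task, images=image, return_tensors="pt").to(DEVICE)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    # The processor parses the raw generated text into structured output
    # (e.g. boxes and labels for detection tasks).
    return processor.post_process_generation(
        generated_text, task=task, image_size=image.size)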