Spaces:

SkalskiP
/

better-florence-2

Running on Zero

App Files Files Community

SkalskiP committed on Jul 9

Commit

fc7652c

•

1 Parent(s): 1858b2a

more tasks

Browse files

Files changed (3) hide show

app.py +36 -8
utils/models.py +27 -3
utils/tasks.py +26 -6

app.py CHANGED Viewed

@@ -4,16 +4,19 @@ import gradio as gr
 import spaces
 import supervision as sv
 import torch
-from gradio_image_prompter import ImagePrompter
 from PIL import Image
 from utils.annotate import annotate_with_boxes
-from utils.models import load_models, run_inference, CHECKPOINTS
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
-    CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
-    TEXTBOX_OUTPUT_TASK_NAMES
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
@@ -43,12 +46,14 @@ processed by a transformer-based multi-modal encoder-decoder to generate the res
 """
 EXAMPLES = [
     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
-    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None]
 ]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -67,17 +72,40 @@ def process(
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
-    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections), None
-    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return None, response[task]
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
@@ -100,7 +128,7 @@ with gr.Blocks() as demo:
             image_input_component = gr.Image(
                 type='pil', label='Upload image')
             image_prompter_input_component = ImagePrompter(
-                type='pil', label='Upload image and draw box prompt', visible=False)
             submit_button_component = gr.Button(value='Submit', variant='primary')
         with gr.Column():

 import spaces
 import supervision as sv
 import torch
 from PIL import Image
+from gradio_image_prompter import ImagePrompter
 from utils.annotate import annotate_with_boxes
+from utils.models import load_models, run_inference, CHECKPOINTS, \
+    pre_process_region_task_input, post_process_region_output
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
+    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
+    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
+    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
+    DENSE_REGION_CAPTION_TASK_NAME
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
 """
 EXAMPLES = [
     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
+    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
 ]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
+    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections), None
+    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return None, response[task]
+    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
+        detections_list = []
+        print(image_prompter_input)
+        image_input = image_prompter_input["image"]
+        for prompt in image_prompter_input["points"]:
+            text = pre_process_region_task_input(
+                prompt=prompt,
+                resolution_wh=image_input.size
+            )
+            _, response = run_inference(
+                model, processor, DEVICE, image_input, task, text)
+            detections = sv.Detections.from_lmm(
+                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+            detections_list.append(detections)
+        detections = sv.Detections.merge(detections_list=detections_list)
+        detections = post_process_region_output(
+            detections=detections, resolution_wh=image_input.size)
+        return annotate_with_boxes(image_input, detections), None
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
             image_input_component = gr.Image(
                 type='pil', label='Upload image')
             image_prompter_input_component = ImagePrompter(
+                type='pil', label='Image prompt', visible=False)
             submit_button_component = gr.Button(value='Submit', variant='primary')
         with gr.Column():

utils/models.py CHANGED Viewed

@@ -1,8 +1,11 @@
-import torch
-from typing import Tuple, Dict, Any
-from transformers import AutoModelForCausalLM, AutoProcessor
 from unittest.mock import patch
 from PIL import Image
 from utils.imports import fixed_get_imports
@@ -47,3 +50,24 @@ def run_inference(
     response = processor.post_process_generation(
         generated_text, task=task, image_size=image.size)
     return generated_text, response

+from typing import Tuple, Dict, Any, List
 from unittest.mock import patch
+import numpy as np
+import supervision as sv
+import torch
 from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
 from utils.imports import fixed_get_imports
     response = processor.post_process_generation(
         generated_text, task=task, image_size=image.size)
     return generated_text, response
def pre_process_region_task_input(
    prompt: List[float],
    resolution_wh: Tuple[int, int]
) -> str:
    """Encode a box prompt as a string of four ``<loc_N>`` tokens.

    Args:
        prompt: Six-element sequence unpacked as
            ``(x1, y1, _, x2, y2, _)``; the third and sixth entries are
            ignored. Presumably this is the box format emitted by the
            ``ImagePrompter`` component - verify against the caller.
        resolution_wh: ``(width, height)`` of the image the box was drawn on,
            in the same units as the prompt coordinates.

    Returns:
        The concatenation ``<loc_x1><loc_y1><loc_x2><loc_y2>`` where each
        coordinate is divided by the matching image dimension, scaled by
        1000, floored and clamped to the ``0..999`` range.
    """
    x1, y1, _, x2, y2, _ = prompt
    w, h = resolution_wh
    # Force a float dtype: integer pixel coordinates would otherwise produce
    # an integer array on which true division cannot be stored.
    box = np.array([x1, y1, x2, y2], dtype=np.float64)
    box = box / np.array([w, h, w, h], dtype=np.float64) * 1000
    # A coordinate lying exactly on the right/bottom edge scales to 1000;
    # clamp so the emitted token index stays within 0..999.
    bins = np.clip(np.floor(box), 0, 999).astype(np.int64).tolist()
    return "".join(f"<loc_{coordinate}>" for coordinate in bins)
def post_process_region_output(
    detections: sv.Detections,
    resolution_wh: Tuple[int, int]
) -> sv.Detections:
    """Rescale ``detections.xyxy`` by the image size and return the object.

    Each box is divided by 1000 and multiplied by
    ``(width, height, width, height)``, then truncated to ``int32``. The
    result is written back onto the ``detections`` argument, so the caller's
    object is modified in place and the same object is returned.

    Args:
        detections: Detections whose ``xyxy`` array is rescaled. Presumably
            the incoming boxes are on a 0-1000 normalized grid - TODO confirm
            against what the caller passes in.
        resolution_wh: ``(width, height)`` of the target image.

    Returns:
        The same ``detections`` instance with its ``xyxy`` replaced.
    """
    width, height = resolution_wh
    scale = np.array([width, height, width, height])
    rescaled = detections.xyxy / 1000 * scale
    detections.xyxy = rescaled.astype(np.int32)
    return detections

utils/tasks.py CHANGED Viewed

@@ -1,4 +1,6 @@
 OBJECT_DETECTION_TASK_NAME = "Object Detection"
 CAPTION_TASK_NAME = "Caption"
 DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
@@ -9,6 +11,8 @@ REGION_TO_DESCRIPTION_TASK_NAME = "Region to Description"
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
@@ -19,6 +23,8 @@ TASK_NAMES = [
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
     CAPTION_TASK_NAME: "<CAPTION>",
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
     MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
@@ -27,14 +33,10 @@ TASKS = {
     REGION_TO_CATEGORY_TASK_NAME: "<REGION_TO_CATEGORY>",
     REGION_TO_DESCRIPTION_TASK_NAME: "<REGION_TO_DESCRIPTION>"
 }
-CAPTION_TASK_NAMES = [
-    CAPTION_TASK_NAME,
-    DETAILED_CAPTION_TASK_NAME,
-    MORE_DETAILED_CAPTION_TASK_NAME
-]
 IMAGE_INPUT_TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
@@ -47,6 +49,8 @@ IMAGE_PROMPTER_INPUT_TASK_NAMES = [
 ]
 IMAGE_OUTPUT_TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
     OCR_WITH_REGION_TASK_NAME,
     REGION_TO_CATEGORY_TASK_NAME,
     REGION_TO_DESCRIPTION_TASK_NAME
@@ -56,4 +60,20 @@ TEXTBOX_OUTPUT_TASK_NAMES = [
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
     OCR_TASK_NAME
 ]

 OBJECT_DETECTION_TASK_NAME = "Object Detection"
+REGION_PROPOSAL_TASK_NAME = "Region Proposal"
+DENSE_REGION_CAPTION_TASK_NAME = "Dense Region Caption"
 CAPTION_TASK_NAME = "Caption"
 DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
+    REGION_PROPOSAL_TASK_NAME: "<REGION_PROPOSAL>",
+    DENSE_REGION_CAPTION_TASK_NAME: "<DENSE_REGION_CAPTION>",
     CAPTION_TASK_NAME: "<CAPTION>",
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
     MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
     REGION_TO_CATEGORY_TASK_NAME: "<REGION_TO_CATEGORY>",
     REGION_TO_DESCRIPTION_TASK_NAME: "<REGION_TO_DESCRIPTION>"
 }
 IMAGE_INPUT_TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
 ]
 IMAGE_OUTPUT_TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
     OCR_WITH_REGION_TASK_NAME,
     REGION_TO_CATEGORY_TASK_NAME,
     REGION_TO_DESCRIPTION_TASK_NAME
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
     OCR_TASK_NAME
+]
+IMAGE_TO_IMAGE_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME
+]
+IMAGE_TO_TEXT_TASK_NAMES = [
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME
+]
+IMAGE_PROMPT_TO_IMAGE_TASK_NAMES = [
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
 ]