SkalskiP committed on
Commit
fc7652c
1 Parent(s): 1858b2a

more tasks

Files changed (3)
  1. app.py +36 -8
  2. utils/models.py +27 -3
  3. utils/tasks.py +26 -6
app.py CHANGED
@@ -4,16 +4,19 @@ import gradio as gr
 import spaces
 import supervision as sv
 import torch
-from gradio_image_prompter import ImagePrompter
 from PIL import Image
+from gradio_image_prompter import ImagePrompter
 
 from utils.annotate import annotate_with_boxes
-from utils.models import load_models, run_inference, CHECKPOINTS
+from utils.models import load_models, run_inference, CHECKPOINTS, \
+    pre_process_region_task_input, post_process_region_output
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
-    CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
+    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
-    TEXTBOX_OUTPUT_TASK_NAMES
+    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
+    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
+    DENSE_REGION_CAPTION_TASK_NAME
 
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
@@ -43,12 +46,14 @@ processed by a transformer-based multi-modal encoder-decoder to generate the res
 """
 EXAMPLES = [
     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
+    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
-    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None]
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
 ]
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -67,17 +72,40 @@ def process(
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
 
-    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
+    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections), None
-    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
+
+    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return None, response[task]
 
+    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
+        detections_list = []
+
+        print(image_prompter_input)
+
+        image_input = image_prompter_input["image"]
+        for prompt in image_prompter_input["points"]:
+            text = pre_process_region_task_input(
+                prompt=prompt,
+                resolution_wh=image_input.size
+            )
+            _, response = run_inference(
+                model, processor, DEVICE, image_input, task, text)
+            detections = sv.Detections.from_lmm(
+                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+            detections_list.append(detections)
+        detections = sv.Detections.merge(detections_list=detections_list)
+        detections = post_process_region_output(
+            detections=detections, resolution_wh=image_input.size)
+
+        return annotate_with_boxes(image_input, detections), None
+
 
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
@@ -100,7 +128,7 @@ with gr.Blocks() as demo:
             image_input_component = gr.Image(
                 type='pil', label='Upload image')
             image_prompter_input_component = ImagePrompter(
-                type='pil', label='Upload image and draw box prompt', visible=False)
+                type='pil', label='Image prompt', visible=False)
             submit_button_component = gr.Button(value='Submit', variant='primary')
 
         with gr.Column():
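Note: the new `IMAGE_PROMPT_TO_IMAGE_TASK_NAMES` branch reads the ImagePrompter value as a dict with an `"image"` and a list of drawn `"points"`, runs one inference per drawn box, and merges the resulting detections. A minimal sketch of the payload shape that branch assumes (values are illustrative; the six-number point layout is inferred from the unpacking in `pre_process_region_task_input`):

```python
from PIL import Image

# Illustrative ImagePrompter payload in the shape the new branch reads.
image_prompter_input = {
    "image": Image.new("RGB", (640, 480)),
    "points": [
        # one entry per drawn box: [x1, y1, click_type, x2, y2, click_type]
        [100.0, 120.0, 2.0, 300.0, 360.0, 3.0],
    ],
}

image = image_prompter_input["image"]
for prompt in image_prompter_input["points"]:
    x1, y1, _, x2, y2, _ = prompt  # same unpacking as pre_process_region_task_input
    print((x1, y1, x2, y2), image.size)
```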
utils/models.py CHANGED
@@ -1,8 +1,11 @@
-import torch
-from typing import Tuple, Dict, Any
-from transformers import AutoModelForCausalLM, AutoProcessor
+from typing import Tuple, Dict, Any, List
 from unittest.mock import patch
+
+import numpy as np
+import supervision as sv
+import torch
 from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
 
 from utils.imports import fixed_get_imports
 
@@ -47,3 +50,24 @@ def run_inference(
     response = processor.post_process_generation(
         generated_text, task=task, image_size=image.size)
     return generated_text, response
+
+
+def pre_process_region_task_input(
+    prompt: List[float],
+    resolution_wh: Tuple[int, int]
+) -> str:
+    x1, y1, _, x2, y2, _ = prompt
+    w, h = resolution_wh
+    box = np.array([x1, y1, x2, y2])
+    box /= np.array([w, h, w, h])
+    box *= 1000
+    return "".join([f"<loc_{int(coordinate)}>" for coordinate in box])
+
+
+def post_process_region_output(
+    detections: sv.Detections,
+    resolution_wh: Tuple[int, int]
+) -> sv.Detections:
+    w, h = resolution_wh
+    detections.xyxy = (detections.xyxy / 1000 * np.array([w, h, w, h])).astype(np.int32)
+    return detections
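Note: the two new helpers round-trip between pixel coordinates and the 0–1000 location-token grid Florence-2 uses for region prompts. A minimal numpy-only sketch of that round trip (the model call itself is omitted; numbers are illustrative):

```python
import numpy as np

# A box drawn on a 640x480 image, in the assumed six-number prompt layout
# [x1, y1, click_type, x2, y2, click_type].
prompt = [32.0, 40.0, 2.0, 608.0, 440.0, 3.0]
w, h = 640, 480

# pre_process_region_task_input: normalize to the 0-1000 grid, emit <loc_*> tokens.
x1, y1, _, x2, y2, _ = prompt
box = np.array([x1, y1, x2, y2]) / np.array([w, h, w, h]) * 1000
text = "".join(f"<loc_{int(c)}>" for c in box)
print(text)  # <loc_50><loc_83><loc_950><loc_916>

# post_process_region_output: Florence-2 answers on the same 0-1000 grid,
# so detection boxes are scaled back to pixels before annotation.
xyxy = box.reshape(1, 4)  # stand-in for detections.xyxy
print((xyxy / 1000 * np.array([w, h, w, h])).astype(np.int32))  # [[ 32  40 608 440]]
```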
utils/tasks.py CHANGED
@@ -1,4 +1,6 @@
 OBJECT_DETECTION_TASK_NAME = "Object Detection"
+REGION_PROPOSAL_TASK_NAME = "Region Proposal"
+DENSE_REGION_CAPTION_TASK_NAME = "Dense Region Caption"
 CAPTION_TASK_NAME = "Caption"
 DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
@@ -9,6 +11,8 @@ REGION_TO_DESCRIPTION_TASK_NAME = "Region to Description"
 
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
@@ -19,6 +23,8 @@ TASK_NAMES = [
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
+    REGION_PROPOSAL_TASK_NAME: "<REGION_PROPOSAL>",
+    DENSE_REGION_CAPTION_TASK_NAME: "<DENSE_REGION_CAPTION>",
     CAPTION_TASK_NAME: "<CAPTION>",
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
     MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
@@ -27,14 +33,10 @@ TASKS = {
     REGION_TO_CATEGORY_TASK_NAME: "<REGION_TO_CATEGORY>",
     REGION_TO_DESCRIPTION_TASK_NAME: "<REGION_TO_DESCRIPTION>"
 }
-CAPTION_TASK_NAMES = [
-    CAPTION_TASK_NAME,
-    DETAILED_CAPTION_TASK_NAME,
-    MORE_DETAILED_CAPTION_TASK_NAME
-]
-
 IMAGE_INPUT_TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
@@ -47,6 +49,8 @@ IMAGE_PROMPTER_INPUT_TASK_NAMES = [
 ]
 IMAGE_OUTPUT_TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
     OCR_WITH_REGION_TASK_NAME,
     REGION_TO_CATEGORY_TASK_NAME,
     REGION_TO_DESCRIPTION_TASK_NAME
@@ -56,4 +60,20 @@ TEXTBOX_OUTPUT_TASK_NAMES = [
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
     OCR_TASK_NAME
+]
+IMAGE_TO_IMAGE_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME
+]
+IMAGE_TO_TEXT_TASK_NAMES = [
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME
+]
+IMAGE_PROMPT_TO_IMAGE_TASK_NAMES = [
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
 ]
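Note: the `*_TASK_NAMES` groups classify each task by which input widget it needs and which output it produces; `process` in app.py branches on the `IMAGE_TO_*` groups, while the `*_INPUT`/`*_OUTPUT` groups are meant to toggle component visibility. A minimal sketch of how such groups can drive a Gradio visibility update on dropdown change (hypothetical handler and illustrative subset lists, not part of this commit; assumes a Gradio version where `gr.update` is available):

```python
import gradio as gr

# Illustrative subsets of the task-name groups defined above.
IMAGE_INPUT_TASK_NAMES = ["Object Detection", "Caption"]
IMAGE_PROMPTER_INPUT_TASK_NAMES = ["Region to Category", "Region to Description"]
IMAGE_OUTPUT_TASK_NAMES = ["Object Detection", "Region to Category", "Region to Description"]
TEXTBOX_OUTPUT_TASK_NAMES = ["Caption"]

def on_task_change(task_name: str):
    # Show either the plain image input or the box-prompt input,
    # and either the annotated-image output or the textbox output.
    return (
        gr.update(visible=task_name in IMAGE_INPUT_TASK_NAMES),
        gr.update(visible=task_name in IMAGE_PROMPTER_INPUT_TASK_NAMES),
        gr.update(visible=task_name in IMAGE_OUTPUT_TASK_NAMES),
        gr.update(visible=task_name in TEXTBOX_OUTPUT_TASK_NAMES),
    )

print(on_task_change("Region to Category"))
```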