nph4rd committed on
Commit
c29ac1a
1 Parent(s): 4019c00

Create app.py

Browse files

add egs

update

remove seg

This view is limited to 50 files because it contains too many changes.   See raw diff
airbnb.jpg ADDED
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PIL.Image
3
+ import transformers
4
+ from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
5
+ import torch
6
+ import os
7
+ import string
8
+ import functools
9
+ import re
10
+ import numpy as np
11
+ import spaces
12
+
13
+
14
# Hub identifiers: the fine-tuned checkpoint supplies the weights, the base
# checkpoint supplies the processor.
model_id = "agentsea/paligemma-3b-ft-widgetcap-waveui-448"
processor_id = "google/paligemma-3b-pt-448"
# Palette of hex colour strings.
COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']
# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the weights, then switch to eval mode and move them to the device.
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
model = model.eval().to(device)
processor = PaliGemmaProcessor.from_pretrained(processor_id)
20
+
21
+ ###### Transformers Inference
22
@spaces.GPU
def infer(
    image: PIL.Image.Image,
    text: str,
    max_new_tokens: int
) -> str:
    """Generate text for an image/prompt pair with the module-level model.

    Args:
        image: Input image handed to the processor.
        text: Prompt handed to the processor.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first decoded sequence with its first ``len(text)`` characters
        dropped (presumably the echoed prompt) and leading newlines stripped.
    """
    model_inputs = processor(text=text, images=image, return_tensors="pt")
    model_inputs = model_inputs.to(device)
    with torch.inference_mode():
        # Greedy decoding: sampling is disabled.
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
    completion = decoded[0][len(text):]
    return completion.lstrip("\n")
37
+
38
def parse_segmentation(input_image, input_text):
    """Run the model on an image/prompt pair and build an annotated-image value.

    Args:
        input_image: Image whose ``size`` is read as ``(width, height)``.
        input_text: Prompt forwarded to ``infer``.

    Returns:
        A ``(image, annotations)`` tuple. ``annotations`` is a list of
        ``(region, label)`` pairs where ``region`` is the object's mask when
        one is present, otherwise its ``(x1, y1, x2, y2)`` box, and ``label``
        is the object's name (empty string when the name is missing).
    """
    out = infer(input_image, input_text, max_new_tokens=100)
    width, height = input_image.size[0], input_image.size[1]
    objs = extract_objs(out.lstrip("\n"), width, height, unique_labels=True)
    annotations = [
        (
            obj['mask'] if obj.get('mask') is not None else obj['xyxy'],
            obj['name'] or '',
        )
        for obj in objs
        # Plain-text fragments carry only 'content' and are not drawable.
        if 'mask' in obj or 'xyxy' in obj
    ]
    return input_image, annotations
57
+
58
+ ######## Demo
59
+
60
INTRO_TEXT = """## PaliGemma WaveUI\n\n
Bla bla
"""


# NOTE(review): the nesting below is reconstructed; all widgets are assumed to
# live inside the "Detection" tab -- confirm against the deployed layout.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)
    with gr.Tab("Detection"):
        # Input widgets, submit button and output viewer.
        image = gr.Image(type="pil")
        seg_input = gr.Text(label="Entities to Detect")
        seg_btn = gr.Button("Submit")
        annotated_image = gr.AnnotatedImage(label="Output")

        # Clickable example row that pre-fills the image and prompt.
        examples = [["./airbnb.jpg", "detect 'Amazing pools' button"]]
        gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
        gr.Examples(examples=examples, inputs=[image, seg_input])

        # Wire the button to the detection pipeline.
        seg_inputs = [image, seg_input]
        seg_outputs = [annotated_image]
        seg_btn.click(
            fn=parse_segmentation,
            inputs=seg_inputs,
            outputs=seg_outputs,
        )
92
+
93
+
94
# One detection: optional leading text, four <locNNNN> tokens, an optional run
# of sixteen <segNNN> tokens, an optional label and an optional "; " separator.
_SEGMENT_DETECT_RE = re.compile(
    r'(.*?)' +
    r'<loc(\d{4})>' * 4 + r'\s*' +
    '(?:%s)?' % (r'<seg(\d{3})>' * 16) +
    r'\s*([^;<>]+)? ?(?:; )?',
)


def extract_objs(text, width, height, unique_labels=False):
    """Returns objs for a string with "<loc>" and "<seg>" tokens.

    Args:
        text: Model output to parse.
        width: Image width in pixels, used to scale x coordinates.
        height: Image height in pixels, used to scale y coordinates.
        unique_labels: When true, a label that was already seen gets ``'``
            appended until it is unique.

    Returns:
        A list of dicts in input order. Plain-text fragments are
        ``{"content": str}``. Detections additionally carry ``xyxy`` (pixel
        ``(x1, y1, x2, y2)``), ``mask`` (always ``None`` here) and ``name``
        (``None`` when the detection has no label).
    """
    objs = []
    seen = set()
    while text:
        m = _SEGMENT_DETECT_RE.match(text)
        if not m:
            break
        gs = list(m.groups())
        before = gs.pop(0)
        name = gs.pop()
        # Location tokens come in y1, x1, y2, x2 order on a 0..1024 scale.
        y1, x1, y2, x2 = [int(x) / 1024 for x in gs[:4]]
        y1, x1, y2, x2 = map(
            round, (y1 * height, x1 * width, y2 * height, x2 * width))
        mask = None

        content = m.group()
        if before:
            # Keep the text preceding the detection as its own fragment.
            objs.append(dict(content=before))
            content = content[len(before):]
        while unique_labels and name in seen:
            name = (name or '') + "'"
        seen.add(name)
        objs.append(dict(
            content=content, xyxy=(x1, y1, x2, y2), mask=mask, name=name))
        # The match is anchored at position 0, so its end is the consumed length.
        text = text[m.end():]

    if text:
        # Whatever could not be parsed is kept as trailing plain text.
        objs.append(dict(content=text))

    return objs
133
+
134
+ #########
135
+
136
if __name__ == "__main__":
    # Enable the request queue (at most 10 waiting) before starting the server.
    demo.queue(max_size=10)
    demo.launch(debug=True)
examples/barsik.jpg ADDED
examples/barsik.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "barsik",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "segment cat",
6
+ "license": "CC0 by [maximneumann@](https://github.com/maximneumann)"
7
+ }
examples/biennale.jpg ADDED
examples/biennale.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "biennale",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "In which city is this?",
6
+ "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)"
7
+ }
examples/billard1.jpg ADDED
examples/billard1.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "billard1",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "How many red balls are there?",
6
+ "license": "CC0 by [mbosnjak@](https://github.com/mbosnjak)"
7
+ }
examples/billard2.jpg ADDED
examples/billard2.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "billard2",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "How many balls are there?",
6
+ "license": "CC0 by [mbosnjak@](https://github.com/mbosnjak)"
7
+ }
examples/bowie.jpg ADDED
examples/bowie.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "bowie",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "Who is this?",
6
+ "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)"
7
+ }
examples/branch.jpg ADDED
examples/branch.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "branch",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "What caused this?",
6
+ "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)"
7
+ }
examples/cc_fox.jpg ADDED
examples/cc_fox.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cc_fox",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-448",
5
+ "prompt": "Which breed is this fox?",
6
+ "license": "CC0 by [XiaohuaZhai@](https://sites.google.com/view/xzhai)"
7
+ }
examples/cc_landscape.jpg ADDED
examples/cc_landscape.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cc_landscape",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-448",
5
+ "prompt": "What does the image show?",
6
+ "license": "CC0 by [XiaohuaZhai@](https://sites.google.com/view/xzhai)"
7
+ }
examples/cc_puffin.jpg ADDED
examples/cc_puffin.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cc_puffin",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-448",
5
+ "prompt": "detect puffin in the back; puffin in front",
6
+ "license": "CC0 by [XiaohuaZhai@](https://sites.google.com/view/xzhai)"
7
+ }
examples/couch.jpg ADDED
examples/couch.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "couch",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "How many yellow cushions are on the couch?",
6
+ "license": "CC0"
7
+ }
examples/couch_.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "couch",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "How many paintings do you see in the image?",
6
+ "license": "CC0"
7
+ }
examples/cups.jpg ADDED
examples/cups.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cups",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "how many cups?",
6
+ "license": "CC0 by [mbosnjak@](https://github.com/mbosnjak)"
7
+ }
examples/dice.jpg ADDED
examples/dice.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "dice",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "segment dice ; dice",
6
+ "license": "CC0 by [andresusanopinto@](https://github.com/andresusanopinto)"
7
+ }
examples/emu.jpg ADDED
examples/emu.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "emu",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "What animal is this?",
6
+ "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)"
7
+ }
examples/fridge.jpg ADDED
examples/fridge.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "fridge",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "Describe the image.",
6
+ "license": "CC0 by [andresusanopinto@](https://github.com/andresusanopinto)"
7
+ }
examples/givt.jpg ADDED
examples/givt.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "givt",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "What does the image show?",
6
+ "license": "CC-BY [GIVT paper](https://arxiv.org/abs/2312.02116)"
7
+ }
examples/greenlake.jpg ADDED
examples/greenlake.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "greenlake",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "Describe the image.",
6
+ "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)"
7
+ }
examples/howto.jpg ADDED
examples/howto.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "howto",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "What does this image show?",
6
+ "license": "CC-BY [How to train your ViT?](https://arxiv.org/abs/2106.10270)"
7
+ }
examples/markers.jpg ADDED
examples/markers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "markers",
3
+ "comment": "answer en How many cups are there?",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "How many cups are there?",
6
+ "license": "CC0"
7
+ }
examples/mcair.jpg ADDED
examples/mcair.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "mcair",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "Can you board this airplane?",
6
+ "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)"
7
+ }
examples/mcair_.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "mcair",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "Is this a restaurant?",
6
+ "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)"
7
+ }
examples/minergie.jpg ADDED
examples/minergie.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "minergie",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "ocr",
6
+ "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)"
7
+ }
examples/morel.jpg ADDED
examples/morel.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "morel",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "detect morel",
6
+ "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)"
7
+ }
examples/motorcyclists.jpg ADDED
examples/motorcyclists.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "motorcyclists",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "What does the image show?",
6
+ "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)"
7
+ }
examples/parking.jpg ADDED
examples/parking.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "parking",
3
+ "comment": "",
4
+ "model": "paligemma-3b-mix-224",
5
+ "prompt": "Describe the image.",
6
+ "license": "CC0 by [xiaohuazhai@](https://huggingface.co/xiaohuazhai)"
7
+ }