# https://huggingface.co/koajoel/PolyFormer
import os
import math
import re
import base64
from io import BytesIO

import cv2
import numpy as np
import torch
import gradio as gr
from PIL import Image
from torchvision import transforms

from fairseq import utils, tasks, checkpoint_utils
from utils.checkpoint_utils import load_model_ensemble_and_task
from utils.eval_utils import eval_step
# Importing the task and model registers them with fairseq.
from tasks.refcoco import RefcocoTask
from models.polyformer import PolyFormerModel
from demo import visual_grounding
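
# Per the Interface spec below, visual_grounding(image, text) takes a PIL image
# and a referring expression and returns two numpy images: the grounding
# visualization and the predicted segmentation mask. A minimal standalone
# sketch (the image path and expression are just illustrative):
#
#   img = Image.open('demo/vases.jpg')
#   overlay, mask = visual_grounding(img, 'the blue vase on the left')
#   cv2.imwrite('mask.png', mask)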

title = "PolyFormer for Visual Grounding"

description = """<p style='text-align: center'> <a href='https://polyformer.github.io/' target='_blank'>Project Page</a> | <a href='https://arxiv.org/pdf/2302.07387.pdf' target='_blank'>Paper</a> | <a href='https://github.com/amazon-science/polygon-transformer' target='_blank'>Github Repo</a></p>
                 <p style='text-align: left'> Demo of PolyFormer for referring image segmentation and referring expression comprehension. Upload your own image or click one of the examples, write a description of the object you want to segment or locate, then click \"Submit\" and wait for the results.</p>
<p>For faster inference without waiting in the queue, you may duplicate this Space and upgrade to a GPU in the settings.
<br/>
<a href="https://huggingface.co/spaces/koajoel/PolyFormer?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>
"""

examples = [['demo/vases.jpg', 'the blue vase on the left'],
            ['demo/dog.jpg', 'the dog wearing glasses'],
            ['demo/bear.jpeg', 'a bear astronaut in the space'],
            ['demo/unicorn.jpeg', 'a unicorn doing computer vision research'],
            ['demo/pig.jpeg', 'a pig robot preparing a delicious meal'],
            ['demo/otta.png', 'a gentleman otter in a 19th century portrait'],
            ['demo/pikachu.jpeg', 'a pikachu fine-dining with a view of the Eiffel Tower'],
            ['demo/cabin.jpeg', 'a small cabin on top of a snowy mountain in the style of Disney art station']
            ]
# Gradio 3+ component API (the gr.inputs / gr.outputs modules were removed in Gradio 3)
io = gr.Interface(fn=visual_grounding,
                  inputs=[gr.Image(type='pil'), gr.Textbox()],
                  outputs=[gr.Image(label="output", type='numpy'),
                           gr.Image(label="predicted mask", type='numpy')],
                  title=title, description=description, examples=examples,
                  allow_flagging='never', cache_examples=False)
io.launch()
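
# Optional: on a busy Space, enabling Gradio's request queue (Gradio 3+) avoids
# HTTP timeouts during slow inference; an alternative to the plain launch above:
#
#   io.queue().launch()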