import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, BlipForQuestionAnswering, ViltForQuestionAnswering
import torch

torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
torch.hub.download_url_to_file('https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png', 'stop_sign.png')
torch.hub.download_url_to_file('https://cdn.openai.com/dall-e-2/demos/text2im/astronaut/horse/photo/0.jpg', 'astronaut.jpg')
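# these downloaded images double as the clickable examples in the Gradio interface defined below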

git_processor_base = AutoProcessor.from_pretrained("microsoft/git-base-vqav2")
git_model_base = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vqav2")

# git_processor_large = AutoProcessor.from_pretrained("microsoft/git-large-vqav2")
# git_model_large = AutoModelForCausalLM.from_pretrained("microsoft/git-large-vqav2")

blip_processor_base = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
blip_model_base = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
# blip_model_large = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")

vilt_processor = AutoProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vilt_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
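
# note: GIT and BLIP generate free-form answers autoregressively, while this ViLT checkpoint
# treats VQA as classification over a fixed vocabulary of frequent VQAv2 answers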

device = "cuda" if torch.cuda.is_available() else "cpu"

# move the models used in the demo to the GPU when one is available
git_model_base.to(device)
blip_model_base.to(device)
# git_model_large.to(device)
# blip_model_large.to(device)
vilt_model.to(device)
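
# each helper below takes a (processor, model) pair plus the user's image and question,
# and returns a plain-text answer for one output Textbox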

def generate_answer_git(processor, model, image, question):
    # prepare image
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # prepare question: GIT expects the question tokens prefixed with the CLS token
    input_ids = processor(text=question, add_special_tokens=False).input_ids
    input_ids = [processor.tokenizer.cls_token_id] + input_ids
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

    generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)

    # the generated sequence repeats the question tokens, so only decode the newly generated part
    generated_answer = processor.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]

    return generated_answer


def generate_answer_blip(processor, model, image, question):
    # prepare image + question
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)

    generated_ids = model.generate(**inputs, max_length=50)
    generated_answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_answer


def generate_answer_vilt(processor, model, image, question):
    # prepare image + question
    encoding = processor(images=image, text=question, return_tensors="pt").to(device)

    # ViLT scores a fixed answer vocabulary; return the highest-scoring label
    with torch.no_grad():
        outputs = model(**encoding)
    predicted_class_idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]


def generate_answers(image, question):
    answer_git_base = generate_answer_git(git_processor_base, git_model_base, image, question)

    # answer_git_large = generate_answer_git(git_processor_large, git_model_large, image, question)

    answer_blip_base = generate_answer_blip(blip_processor_base, blip_model_base, image, question)

    # answer_blip_large = generate_answer_blip(blip_processor_large, blip_model_large, image, question)

    answer_vilt = generate_answer_vilt(vilt_processor, vilt_model, image, question)

    # one answer per output Textbox defined below
    return answer_git_base, answer_blip_base, answer_vilt

   
examples = [["cats.jpg", "How many cats are there?"], ["stop_sign.png", "What's behind the stop sign?"], ["astronaut.jpg", "What's the astronaut riding on?"]]
outputs = [gr.outputs.Textbox(label="Answer generated by GIT-base"), gr.outputs.Textbox(label="Answer generated by BLIP-base"), gr.outputs.Textbox(label="Answer generated by ViLT")] 

title = "Interactive demo: comparing visual question answering (VQA) models"
description = "Gradio demo to compare GIT, BLIP and ViLT, three state-of-the-art vision+language models. To use it, upload an image, type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://huggingface.co/docs/transformers/main/model_doc/blip' target='_blank'>BLIP docs</a> | <a href='https://huggingface.co/docs/transformers/main/model_doc/git' target='_blank'>GIT docs</a></p>"

interface = gr.Interface(fn=generate_answers, 
                         inputs=[gr.inputs.Image(type="pil"), gr.inputs.Textbox(label="Question")],
                         outputs=outputs,
                         examples=examples, 
                         title=title,
                         description=description,
                         article=article, 
                         enable_queue=True)
interface.launch()