File size: 1,867 Bytes
d60d34b
acda6c7
 
d60d34b
acda6c7
ce86d1c
acda6c7
 
 
ce86d1c
acda6c7
3601eff
 
 
 
 
 
d60d34b
b9c781e
 
3601eff
d60d34b
 
 
3601eff
 
 
 
 
acda6c7
 
d60d34b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3601eff
d60d34b
 
 
 
 
 
 
3601eff
d60d34b
 
3601eff
 
d60d34b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces

# The model weights and the matching pre-processor are loaded from the same
# checkpoint, so the identifier is spelled out once.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
model = model.to("cuda")  # calculate_score moves its inputs to "cuda" as well
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


@spaces.GPU(duration=120)
def calculate_score(image, text):
    """Score an image against semicolon-separated text descriptions with CLIP.

    Args:
        image: The image to score, as delivered by the Gradio image
            component. May be ``None`` (for example when the text box is
            submitted before any image has been provided).
        text: Candidate descriptions separated by ``;``. Whitespace around
            each description is stripped and empty entries are dropped.

    Returns:
        A dict mapping each description to its image-text logit divided by
        100, as a plain Python float. The dict is empty when there is no
        image or no non-empty description. Repeated descriptions collapse
        into a single key.
    """
    labels = [label.strip() for label in (text or "").split(";")]
    labels = [label for label in labels if label]
    # Nothing to score without both an image and at least one description;
    # passing image=None on to the processor would raise instead.
    if image is None or not labels:
        return {}

    # truncation=True keeps over-long descriptions within the text encoder's
    # maximum sequence length instead of failing inside the model call.
    inputs = processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}

    # Inference only: no_grad avoids building an autograd graph for the
    # forward pass (and makes an explicit detach() unnecessary).
    with torch.no_grad():
        outputs = model(**inputs)

    # A single image is passed in, so only row 0 of logits_per_image is used.
    # tolist() moves the values to plain Python floats on the host.
    image_logits = outputs.logits_per_image[0].cpu().tolist()

    # NOTE(review): the division by 100 presumably undoes CLIP's logit scale
    # so the values read as similarities — confirm against the model config.
    return {label: logit / 100.0 for label, logit in zip(labels, image_logits)}


# Gradio front end: an image and a text box feed calculate_score, whose
# result is shown in a Label component.
with gr.Blocks() as demo:
    # Page header: title and a one-line description.
    gr.Markdown("# CLIP Score")
    gr.Markdown(
        "Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text"
    )

    # Input image and score output share one row.
    with gr.Row():
        image_input = gr.Image()
        output_label = gr.Label()

    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")

    # Recompute the scores when the image changes and when the text box is
    # submitted; both events use the same function, inputs and output.
    for register_event in (image_input.change, text_input.submit):
        register_event(
            fn=calculate_score,
            inputs=[image_input, text_input],
            outputs=output_label,
        )

    # One bundled example: an image file plus a set of candidate descriptions.
    example_descriptions = "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void"
    gr.Examples(
        examples=[["cat.jpg", example_descriptions]],
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label,
    )

demo.launch()