File size: 4,575 Bytes
8b891df
 
e45afa6
 
8b891df
e45afa6
 
 
8b891df
 
6b8803d
e45afa6
 
8b891df
 
 
 
 
 
 
e45afa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b891df
 
 
 
 
 
 
 
 
 
 
e45afa6
 
 
8b891df
08c842e
e45afa6
8b891df
4b49d49
e45afa6
 
 
 
70e10bc
e45afa6
804947e
70e10bc
 
e45afa6
 
 
 
70e10bc
804947e
 
 
70e10bc
 
804947e
68d25a5
70e10bc
804947e
 
70e10bc
 
804947e
 
70e10bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e45afa6
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
import torch
import os

from PIL import Image
from pathlib import Path
from more_itertools import chunked

from transformers import CLIPProcessor, CLIPModel

# Hugging Face model id of the few-shot fine-tuned CLIP emoji predictor.
checkpoint = "vincentclaes/emoji-predictor"
# Discover the emoji images shipped with the app; files are named <i>.png
# where <i> is the emoji's label index. os.walk yields (dirpath, dirnames,
# filenames); only the filenames are needed here.
x_, _, files = next(os.walk("./emojis"))
no_of_emojis = range(len(files))
# Pre-load every candidate emoji image once at startup so each request
# doesn't re-open them.
emojis_as_images = [Image.open(f"emojis/{i}.png") for i in no_of_emojis]
# Number of suggestions shown per query; concat_images builds a 2x2 grid,
# so this must stay 4.
K = 4

# Load the processor and model once at import time (downloads from the
# Hugging Face hub on first run).
processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint)


def concat_images(*images):
    """Paste the supplied images into a single 2x2 composite.

    Each cell of the grid is sized to the maximum width/height over all
    supplied images, so every image fits its cell. Images are placed
    row-major: index 0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right.
    Layout idea adapted from https://stackoverflow.com/a/71315656/1771155

    Args:
        *images: PIL.Image objects to combine (the app passes exactly 4).

    Returns:
        A new RGB PIL.Image of size (2 * max_width, 2 * max_height).
    """
    # Cell dimensions: the widest width and the tallest height (NOT a sum),
    # so each image fits inside its grid cell.
    width = max(image.width for image in images)
    height = max(image.height for image in images)
    composite = Image.new('RGB', (2 * width, 2 * height))
    # Row-major 2x2 placement: index i maps to (row, col) = divmod(i, 2).
    for i, image in enumerate(images):
        row, col = divmod(i, 2)
        composite.paste(image, (col * width, row * height))
    return composite


def get_emoji(text, model=model, processor=processor, emojis=emojis_as_images, K=4):
    """Suggest the top-K emojis for a piece of text as one composite image.

    Scores *text* against every candidate emoji image with the fine-tuned
    CLIP model, picks the K highest-probability emojis, and returns them
    pasted into a single 2x2 composite.

    Args:
        text: Input sentence to match against the emoji set.
        model: CLIP model (defaults to the module-level fine-tuned model).
        processor: CLIPProcessor matching *model*.
        emojis: Candidate emoji images (defaults to all pre-loaded emojis).
        K: Number of suggestions (the 2x2 composite expects 4).

    Returns:
        A PIL.Image composite of the K suggested emojis.
    """
    inputs = processor(text=text, images=emojis, return_tensors="pt",
                       padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping to save memory/compute.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_text has shape (1, n_emojis); softmax over the emoji axis
    # converts the similarity scores into probabilities.
    probs = outputs.logits_per_text.softmax(dim=1)
    # Indices of the K most probable emojis for the single input text.
    top_k = torch.topk(probs[0], K).indices.tolist()

    # Emoji files are named by their label index: emojis/<i>.png.
    images = [Image.open(f"emojis/{i}.png") for i in top_k]
    return concat_images(*images)


# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): gr.inputs.Textbox and gr.Image(shape=...) are the legacy
# (pre-3.x/4.x) Gradio API — confirm the pinned gradio version before
# upgrading, as newer releases removed gr.inputs.
text = gr.inputs.Textbox(placeholder="Enter a text and we will try to predict an emoji...")
title = "Predicting an Emoji"
# Markdown shown above the input: the 32 emojis the model can suggest from.
description = """You provide a sentence and our few-shot fine tuned CLIP model will suggest 4 from the following emoji's:
\n❀️ 😍 πŸ˜‚ πŸ’• πŸ”₯ 😊 😎 ✨ πŸ’™ 😘 πŸ“· πŸ‡ΊπŸ‡Έ β˜€ πŸ’œ πŸ˜‰ πŸ’― 😁 πŸŽ„ πŸ“Έ 😜 ☹️ 😭 πŸ˜” 😑 πŸ’’ 😀 😳 πŸ™ƒ 😩 😠 πŸ™ˆ πŸ™„\n
"""
# Markdown shown below the demo: background, links, and evaluation table.
article = """
\n
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
\n
# Context
I fine tuned Open Ai's CLIP model on both text (tweets) and images of emoji's!\n
The current model you can play with is fine-tuned on 15 samples per emoji.

- model: https://huggingface.co/vincentclaes/emoji-predictor \n
- dataset: https://huggingface.co/datasets/vincentclaes/emoji-predictor \n
- profile: https://huggingface.co/vincentclaes \n

# Performance

Below you can find a table with the precision for predictions and suggestions 
for a range of samples per emoji we fine-tuned CLIP on.

### Prediction vs. Suggestion
- The column "Prediction" indicates the precision for predicting the right emoji.

- Since there can be some confusion about the right emoji for a tweet,
I also tried to present 4 suggestions. If 1 of the 4 suggestions is the same as the label,
I consider it a valid prediction. See the column "Suggestion".

- Randomly predicting an emoji would have a precision of 1/32 or 0.0325.
- Randomly suggesting an emoji would have a precision of 4/32 or 0.12.


           | Samples  | Prediction  | Suggestion  |
           |--------- |------------ |------------ |
           |    0     |    0.13     |    0.33     |
           |    1     |    0.11     |    0.30     |
           |    5     |    0.14     |    0.38     |
           |    10    |    0.20     |    0.45     |
           |    15    |    0.22     |    0.51     |
           |    20    |    0.19     |    0.49     |
           |    25    |    0.24     |    0.54     |
           |    50    |    0.23     |    0.53     |
           |   100    |    0.25     |    0.57     |
           |   250    |    0.29     |    0.62     |
           |   500    |    0.29     |    0.63     |
           
           


"""
# Example sentences pre-filled in the demo UI.
examples = [
    "I'm so happy for you!",
    "I'm not feeling great today.",
    "This makes me angry!",
    "Can I follow you?",
    "I'm so bored right now ...",
]
# Build the interface and start the server (blocking call).
gr.Interface(fn=get_emoji, inputs=text, outputs=gr.Image(shape=(72,72)), 
             examples=examples, title=title, description=description,
             article=article).launch()