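"""Gradio demo: zero-shot NSFW image tagging with a finetuned OpenCLIP ViT-H model.

The app offers two tabs: per-category predictions (type, scene, expression and
clothing scored independently) and a single combined prediction over all labels.
"""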
import gradio as gr
import open_clip
import torch

# Load model and tokenizer
DEVICE = 'cpu'  # change to 'cuda' to run on a GPU
model, preprocess = open_clip.create_model_from_pretrained('hf-hub:woweenie/open-clip-vit-h-nsfw-finetune', device=DEVICE)
tokenizer = open_clip.get_tokenizer('hf-hub:woweenie/open-clip-vit-h-nsfw-finetune')

# Define labels
type_labels = ['2.5d render', '3d render', 'photograph', 'anime drawing', 'drawing', 'illustration', 'painting', 'pre-raphaelite painting', 'concept artwork', 'screenshot']
scene_labels = ['in an airport', 'in the bath', 'on a bed', 'in bed', 'in a bedroom', 'at the beach', 'on a boat', 'in a tent', 'in a car', 'on a chair', 'in the city', 'in a dressing room', 'on the floor', 'at the gym', 'in a hotel room', 'in a kitchen', 'in a living room', 'in an office', 'by a harbor', 'on a bench', 'in a park', 'by a piano', 'on a forest road', 'in a forest', 'in a garden', 'at a lake', 'on the grass', 'on the ground', 'on a paved surface', 'outdoors, on a rock', 'outdoors, on a rug', 'outdoors, on a towel', 'in a photo studio', 'at the pool', 'at a river', 'on a road', 'by the sea', 'showering', 'in the shower', 'on a stool', 'on a rug', 'on a rock', 'on a sofa', 'on a table', 'at a table', 'in a store', 'on snow', 'by a waterfall', 'with a water feature', 'on a windowsill']
expression_labels = ['scared', 'annoyed', 'aroused', 'bored', 'confident', 'distracted', 'dominating', 'embarrassed', 'laughing', 'shy', 'orgasm']
clothing_labels = ['a bikini that is too small', 'bikini bottoms', 'a bikini top', 'a bikini', 'a bodysuit', 'a bra', 'a crop top', 'a dress', 'garters', 'glasses', 'goggles', 'gym shorts', 'a halter top', 'a hat', 'a handbra', 'a hoodie', 'a jacket', 'jeans', 'a jumper', 'a gown', 'a lace-up top', 'leggings', 'lingerie', 'a long sleeved top', 'an off-shoulder top', 'a nightgown', 'a coat', 'overalls', 'pink pajamas', 'pajamas', 'panties', 'pantyhose', 'a t-shirt', 'a robe', 'a bathrobe', 'a piece of fabric', 'a scarf', 'a shirt', 'shorts', 'a skirt', 'a sleeveless top', 'a slip', 'sneakers', 'tube socks', 'a sports bra', 'sunglasses', 'sweatpants', 'a one piece swimsuit', 'a tank top', 'a tied shirt', 'a top', 'long pants', 'a wetsuit', 'a backpack', 'high hem', 'see-through', 'short', 'tight']
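# Phrase each clothing label as "wearing X" so it reads as a natural CLIP text prompt.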
clothing_labels = ['wearing ' + label for label in clothing_labels]

def process_image_separate_tags(image):
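    """Score the image against each label group independently (one softmax per category)."""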
    # Preprocess image
    image = preprocess(image).unsqueeze(0).to(DEVICE)

    # Tokenize labels
    type_text = tokenizer(type_labels).to(DEVICE)
    scene_text = tokenizer(scene_labels).to(DEVICE) 
    expression_text = tokenizer(expression_labels).to(DEVICE)
    clothing_text = tokenizer(clothing_labels).to(DEVICE)

    with torch.no_grad():
        # Encode image and text
        image_features = model.encode_image(image)
        type_text_features = model.encode_text(type_text)
        scene_text_features = model.encode_text(scene_text)
        expression_text_features = model.encode_text(expression_text)
        clothing_text_features = model.encode_text(clothing_text)

        # Normalize features
        image_features /= image_features.norm(dim=-1, keepdim=True)
        type_text_features /= type_text_features.norm(dim=-1, keepdim=True)
        scene_text_features /= scene_text_features.norm(dim=-1, keepdim=True)
        expression_text_features /= expression_text_features.norm(dim=-1, keepdim=True)
        clothing_text_features /= clothing_text_features.norm(dim=-1, keepdim=True)

        # Calculate cosine similarities and apply softmax
        # Using temperature parameter to control the "sharpness" of the distribution
        temperature = 0.1  # Lower values make the distribution more peaked
        type_text_probs = torch.softmax(image_features @ type_text_features.T / temperature, dim=-1)
        scene_text_probs = torch.softmax(image_features @ scene_text_features.T / temperature, dim=-1)
        expression_text_probs = torch.softmax(image_features @ expression_text_features.T / temperature, dim=-1)
        clothing_text_probs = torch.softmax(image_features @ clothing_text_features.T / temperature, dim=-1)

    # Convert to dictionaries
    type_results = {label: float(type_text_probs[0][i]) for i, label in enumerate(type_labels)}
    scene_results = {label: float(scene_text_probs[0][i]) for i, label in enumerate(scene_labels)}
    expression_results = {label: float(expression_text_probs[0][i]) for i, label in enumerate(expression_labels)}
    clothing_results = {label: float(clothing_text_probs[0][i]) for i, label in enumerate(clothing_labels)}

    return type_results, scene_results, expression_results, clothing_results

def process_image_combined_tags(image):
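    """Score the image against every label in a single shared softmax.

    Unlike process_image_separate_tags, all categories compete for probability
    mass here, so one dominant tag can suppress the rest.
    """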
    # Preprocess image
    image = preprocess(image).unsqueeze(0).to(DEVICE)

    # Tokenize labels
    all_text = tokenizer(type_labels + scene_labels + expression_labels + clothing_labels).to(DEVICE)

    with torch.no_grad():
        # Encode image and text
        image_features = model.encode_image(image)
        all_text_features = model.encode_text(all_text)

        # Normalize features
        image_features /= image_features.norm(dim=-1, keepdim=True)
        all_text_features /= all_text_features.norm(dim=-1, keepdim=True)

        # Calculate cosine similarities and apply softmax
        # Using temperature parameter to control the "sharpness" of the distribution
        temperature = 0.1  # Lower values make the distribution more peaked
        cosine_similarities = image_features @ all_text_features.T
        all_text_probs = torch.softmax(cosine_similarities / temperature, dim=-1)

    # Convert to dictionaries
    all_results = {label: float(all_text_probs[0][i]) for i, label in enumerate(type_labels + scene_labels + expression_labels + clothing_labels)}

    return all_results

# Create Gradio interface
iface = gr.Blocks(title="NSFW Tagging with Finetuned CLIP")

with iface:
    gr.Markdown("# NSFW Tagging with Finetuned CLIP")
    gr.Markdown("Upload an image to analyze its content across multiple NSFW categories.")
    gr.Markdown("Uses [woweenie/open-clip-vit-h-nsfw-finetune](https://huggingface.co/woweenie/open-clip-vit-h-nsfw-finetune) finetuned on NSFW images.")
    gr.Markdown("Disclaimer: This model is not perfect and may make mistakes. Use at your own risk.")

    with gr.Tabs():
        with gr.Tab("Categorical Predictions"):
            with gr.Row():
                image_input2 = gr.Image(type="pil", label="Upload Image")
            with gr.Row():
                type_output = gr.Label(label="Predicted Type", num_top_classes=3)
                scene_output = gr.Label(label="Predicted Scene", num_top_classes=10) 
                expression_output = gr.Label(label="Predicted Expression", num_top_classes=3)
                clothing_output = gr.Label(label="Predicted Clothing", num_top_classes=5)
            predict_btn2 = gr.Button("Analyze")
            predict_btn2.click(
                fn=process_image_separate_tags,
                inputs=image_input2,
                outputs=[type_output, scene_output, expression_output, clothing_output]
            )

        with gr.Tab("Combined Predictions"):
            with gr.Row():
                image_input1 = gr.Image(type="pil", label="Upload Image")
            with gr.Row():
                combined_output = gr.Label(label="Predicted Tags", num_top_classes=10)
            predict_btn1 = gr.Button("Analyze")
            predict_btn1.click(
                fn=process_image_combined_tags,
                inputs=image_input1,
                outputs=combined_output
            )



if __name__ == "__main__":
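    # Start the local Gradio server; launch(share=True) would expose a temporary public URL.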
    iface.launch()