import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import torch.nn.functional as F

# Load the original CLIP model and processor
model_original = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor_original = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load the custom variants of the CLIP model
model_variant_1 = CLIPModel.from_pretrained("biglab/uiclip_jitteredwebsites-2-224-paraphrased")
model_variant_2 = CLIPModel.from_pretrained("biglab/uiclip_jitteredwebsites-2-224-paraphrased_webpairs")
model_variant_3 = CLIPModel.from_pretrained("biglab/uiclip_jitteredwebsites-2-224-paraphrased_webpairs_humanpairs")


# Define the function to process inputs and run inference with all four models
def clip_predict_comparison(image1, image2, text):
    # Preprocess the image and text inputs using the original processor for all models
    inputs_1 = processor_original(text=[text], images=image1, return_tensors="pt")
    inputs_2 = processor_original(text=[text], images=image2, return_tensors="pt")

    def compute_similarity_with_softmax(model, inputs_1, inputs_2):
        # Compute similarity for image 1
        with torch.no_grad():
            outputs_image1 = model(**inputs_1)
            similarity_image1 = outputs_image1.logits_per_image.item()

        # Compute similarity for image 2
        with torch.no_grad():
            outputs_image2 = model(**inputs_2)
            similarity_image2 = outputs_image2.logits_per_image.item()

        # Apply softmax to normalize the scores between the two images
        similarities = torch.tensor([similarity_image1, similarity_image2])
        normalized_scores = F.softmax(similarities, dim=0)

        result = f"Image 1: {normalized_scores[0].item():.4f}, Image 2: {normalized_scores[1].item():.4f}"
        return normalized_scores[0].item(), normalized_scores[1].item(), result

    # Compute similarities for all four models
    similarity_original_1, similarity_original_2, result_original = compute_similarity_with_softmax(model_original, inputs_1, inputs_2)
    similarity_variant_1_1, similarity_variant_1_2, result_variant_1 = compute_similarity_with_softmax(model_variant_1, inputs_1, inputs_2)
    similarity_variant_2_1, similarity_variant_2_2, result_variant_2 = compute_similarity_with_softmax(model_variant_2, inputs_1, inputs_2)
    similarity_variant_3_1, similarity_variant_3_2, result_variant_3 = compute_similarity_with_softmax(model_variant_3, inputs_1, inputs_2)

    # Return the normalized similarity scores from all models along with the comparison result
    return (
        f"Original CLIP: {result_original}",
        f"UIClip: {result_variant_1}",
        f"UIClip + Webpairs: {result_variant_2}",
        f"UIClip + Webpairs + Humanpairs: {result_variant_3}",
    )


# Example inputs (paths to image files and corresponding text descriptions)
examples = [
    ["testcases/original.png", "testcases/bigtitle.png", "ui screenshot. well-designed. e-commerce shopping app"],
    ["testcases/original.png", "testcases/formaterror.png", "ui screenshot. well-designed. e-commerce shopping app"],
    ["testcases/original.png", "testcases/greybackground.png", "ui screenshot. well-designed. e-commerce shopping app"],
    ["testcases/wiki-original.png", "testcases/wiki-color.png", "ui screenshot. well-designed. page displaying information about neon"],
    ["testcases/wiki-original.png", "testcases/wiki-font.png", "ui screenshot. well-designed. page displaying information about neon"],
    ["testcases/wiki-original.png", "testcases/wiki-layout.png", "ui screenshot. well-designed. page displaying information about neon"],
]
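
# Illustrative sketch (not part of the demo itself): the comparison function can also
# be called directly with two PIL images, e.g. to score a pair of screenshots from a
# script without starting the Gradio UI. The file paths below are taken from the
# example list above and are only placeholders; swap in your own screenshots.
#
#   img_a = Image.open("testcases/original.png")
#   img_b = Image.open("testcases/bigtitle.png")
#   scores = clip_predict_comparison(
#       img_a, img_b, "ui screenshot. well-designed. e-commerce shopping app"
#   )
#   print("\n".join(scores))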

# Set up the Gradio interface
demo = gr.Interface(
    fn=clip_predict_comparison,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1", height=400),  # First image input, displayed at 400px height
        gr.Image(type="pil", label="Upload Image 2", height=400),  # Second image input, displayed at 400px height
        gr.Textbox(label="Enter text description"),  # Text input
    ],
    outputs=[
        gr.Textbox(label="OpenAI CLIP"),  # Output for the original model
        gr.Textbox(label="UIClip"),  # Output for variant 1
        gr.Textbox(label="UIClip + Webpairs"),  # Output for variant 2
        gr.Textbox(label="UIClip + Webpairs + Humanpairs"),  # Output for variant 3
    ],
    title="Score and Compare the Design Quality of Two UI Screenshots",
    description="Upload two UI screenshots and provide a prompt in the format \"ui screenshot. well-designed. DESCRIPTION\". A generic description such as \"mobile app screen\" can also be used. The two screenshots are scored with CLIP and three variants of UIClip. The numbers in the output pane represent the probability (normalized via softmax) that one image is better designed than the other.",
    examples=examples,  # Include the example inputs
)

# Launch the Gradio demo app
demo.launch()