import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import torch.nn.functional as F

# Load the original CLIP model and processor
model_original = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor_original = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load the custom variants of the CLIP model
model_variant_1 = CLIPModel.from_pretrained("biglab/uiclip_jitteredwebsites-2-224-paraphrased")
model_variant_2 = CLIPModel.from_pretrained("biglab/uiclip_jitteredwebsites-2-224-paraphrased_webpairs")
model_variant_3 = CLIPModel.from_pretrained("biglab/uiclip_jitteredwebsites-2-224-paraphrased_webpairs_humanpairs")


# Define the function to process inputs and run inference with all four models
def clip_predict_comparison(image1, image2, text):
    # Preprocess the image and text inputs using the original processor for all models
    inputs_1 = processor_original(text=[text], images=image1, return_tensors="pt")
    inputs_2 = processor_original(text=[text], images=image2, return_tensors="pt")

    def compute_similarity_with_softmax(model, inputs_1, inputs_2):
        # Compute similarity for image 1
        with torch.no_grad():
            outputs_image1 = model(**inputs_1)
            similarity_image1 = outputs_image1.logits_per_image.item()

        # Compute similarity for image 2
        with torch.no_grad():
            outputs_image2 = model(**inputs_2)
            similarity_image2 = outputs_image2.logits_per_image.item()

        # Apply softmax to normalize the scores between the two images
        similarities = torch.tensor([similarity_image1, similarity_image2])
        normalized_scores = F.softmax(similarities, dim=0)

        result = f"Image 1: {normalized_scores[0].item():.4f}, Image 2: {normalized_scores[1].item():.4f}"
        return normalized_scores[0].item(), normalized_scores[1].item(), result

    # Compute similarities for all four models
    similarity_original_1, similarity_original_2, result_original = compute_similarity_with_softmax(model_original, inputs_1, inputs_2)
    similarity_variant_1_1, similarity_variant_1_2, result_variant_1 = compute_similarity_with_softmax(model_variant_1, inputs_1, inputs_2)
    similarity_variant_2_1, similarity_variant_2_2, result_variant_2 = compute_similarity_with_softmax(model_variant_2, inputs_1, inputs_2)
    similarity_variant_3_1, similarity_variant_3_2, result_variant_3 = compute_similarity_with_softmax(model_variant_3, inputs_1, inputs_2)

    # Return the normalized similarity scores from all models along with the comparison result
    return (
        f"Original CLIP: {result_original}",
        f"UIClip: {result_variant_1}",
        f"UIClip + Webpairs: {result_variant_2}",
        f"UIClip + Webpairs + Humanpairs: {result_variant_3}",
    )


# Example inputs (paths to image files and corresponding text descriptions)
examples = [
    ["testcases/original.png", "testcases/bigtitle.png", "ui screenshot. well-designed. e-commerce shopping app"],
    ["testcases/original.png", "testcases/formaterror.png", "ui screenshot. well-designed. e-commerce shopping app"],
    ["testcases/original.png", "testcases/greybackground.png", "ui screenshot. well-designed. e-commerce shopping app"],
    ["testcases/wiki-original.png", "testcases/wiki-color.png", "ui screenshot. well-designed. page displaying information about neon"],
    ["testcases/wiki-original.png", "testcases/wiki-font.png", "ui screenshot. well-designed. page displaying information about neon"],
    ["testcases/wiki-original.png", "testcases/wiki-layout.png", "ui screenshot. well-designed. page displaying information about neon"],
]
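
# Illustrative sketch (not part of the demo itself): the comparison function can also
# be called directly with two PIL images, e.g. to score a pair of screenshots from a
# script without starting the Gradio UI. The file paths below are taken from the
# example list above and are only placeholders; swap in your own screenshots.
#
#   img_a = Image.open("testcases/original.png")
#   img_b = Image.open("testcases/bigtitle.png")
#   scores = clip_predict_comparison(
#       img_a, img_b, "ui screenshot. well-designed. e-commerce shopping app"
#   )
#   print("\n".join(scores))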

# Set up the Gradio interface
demo = gr.Interface(
    fn=clip_predict_comparison,
    inputs=[
        gr.Image(type="pil", label="Upload Image 1", height=400),  # First image input, displayed at 400px height
        gr.Image(type="pil", label="Upload Image 2", height=400),  # Second image input, displayed at 400px height
        gr.Textbox(label="Enter text description"),  # Text input
    ],
    outputs=[
        gr.Textbox(label="OpenAI CLIP"),  # Output for the original model
        gr.Textbox(label="UIClip"),  # Output for variant 1
        gr.Textbox(label="UIClip + Webpairs"),  # Output for variant 2
        gr.Textbox(label="UIClip + Webpairs + Humanpairs"),  # Output for variant 3
    ],
    title="Score and Compare the Design Quality of Two UI Screenshots",
    description="Upload two UI screenshots and provide a prompt in the format \"ui screenshot. well-designed. DESCRIPTION\". A generic description such as \"mobile app screen\" can also be used. The two screenshots are scored with CLIP and three variants of UIClip. The numbers in the output pane represent the probability (normalized via softmax) that one image is better designed than the other.",
    examples=examples,  # Include the example inputs
)

# Launch the Gradio demo app
demo.launch()