from transformers import AutoTokenizer, AutoModel
import torch
from PIL import Image
import numpy as np
import gradio as gr

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
model.eval()  # inference only

# Function to preprocess the image (basic resize + normalize; adjust to the model's expected input)
def preprocess_image(image):
    image = image.convert("RGB")  # ensure 3 channels (handles RGBA/grayscale screenshots)
    image = image.resize((224, 224))  # placeholder size; adjust to the resolution the model expects
    image = np.array(image).astype(np.float32) / 255.0  # normalize to [0, 1]
    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, add batch dim
    return image

# Function to predict coordinates from a screenshot and a text prompt
def predict_coordinates(screenshot, prompt):
    # Preprocess the image (screenshot)
    image_tensor = preprocess_image(screenshot)

    # Tokenize the prompt (text input)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Assumes the model accepts both text and image inputs; adjust to the model's actual signature
    with torch.no_grad():
        outputs = model(**inputs, pixel_values=image_tensor)

    # Placeholder: treat the first two output values as (x, y); adapt to the model's real output format
    coordinates = outputs.logits.flatten()
    x, y = coordinates[:2].tolist()

    return {"x": x, "y": y}

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")
    
    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")
    
    submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)

# Launch the Gradio app
demo.launch()