# Gradio demo for neulab/UIX-Qwen2: given a UI screenshot and a natural-language
# instruction, predict the (x, y) screen coordinates of the target UI element.
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import gradio as gr
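
# Requirements (assumed, versions unpinned): pip install torch transformers numpy pillow gradio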

# Load the UIX-Qwen2 tokenizer and model from the Hugging Face Hub.
# (If the checkpoint ships custom modeling code, from_pretrained may also
# need trust_remote_code=True.)
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
model.eval()  # inference only; disables dropout


def preprocess_image(image):
    """Resize a PIL image to 224x224 and return a 1xCxHxW float tensor in [0, 1]."""
    image = image.convert("RGB")  # guard against RGBA or grayscale uploads
    image = image.resize((224, 224))
    image = np.array(image).astype(np.float32) / 255.0
    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0)  # HWC -> 1xCxHxW
    return image


def predict_coordinates(screenshot, prompt):
    image_tensor = preprocess_image(screenshot)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Forward pass without gradients. This assumes the checkpoint accepts
    # `pixel_values` alongside the tokenized prompt and exposes coordinate
    # logits on the output.
    with torch.no_grad():
        outputs = model(**inputs, pixel_values=image_tensor)
    coordinates = outputs.logits

    # Take the highest-scoring position along the last axis. This assumes
    # logits of shape (1, 2, num_positions), one distribution per axis;
    # the batch dimension is squeezed out so the pair unpacks cleanly.
    x, y = torch.argmax(coordinates, dim=-1).squeeze(0).tolist()
    return {"x": x, "y": y}

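# Example call outside the Gradio UI (the screenshot path is illustrative,
# not a file shipped with this demo):
#   from PIL import Image
#   print(predict_coordinates(Image.open("screenshot.png"), "Click on Submit button"))
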
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")

    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")

    submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)

demo.launch()
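# Tip: demo.launch(share=True) would additionally create a temporary public
# URL for the demo (a standard Gradio option).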