from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import gradio as gr

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
model.eval()
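# Note: this sketch assumes a plain tokenizer plus manual image preprocessing.
# If the checkpoint ships a multimodal processor, loading it via
# transformers.AutoProcessor.from_pretrained("neulab/UIX-Qwen2") would be the
# more typical way to prepare both image and text inputs; check the model card.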

# Function to preprocess the screenshot (basic resize + normalization; adjust to the model's expected input)
def preprocess_image(image):
    image = image.convert("RGB")                        # Ensure 3 channels (drops alpha, expands grayscale)
    image = image.resize((224, 224))                    # Placeholder size; adjust to the model's expected resolution
    image = np.array(image).astype(np.float32) / 255.0  # Normalize pixel values to [0, 1]
    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, add batch dimension
    return image

# Function to predict coordinates from a screenshot and a text prompt
def predict_coordinates(screenshot, prompt):
    # Preprocess the screenshot and tokenize the prompt
    image_tensor = preprocess_image(screenshot)
    inputs = tokenizer(prompt, return_tensors="pt")
    # Assumes the model accepts both text and image inputs; adjust to the model's actual forward signature
    with torch.no_grad():
        outputs = model(**inputs, pixel_values=image_tensor)
    # The output could be logits or raw coordinates; we treat it as logits here (adjust to the model's output)
    coordinates = outputs.logits  # Placeholder: adapt to the model's actual coordinate prediction output
    # Example conversion from logits to an (x, y) pair; adjust to the model's actual output format
    predicted = torch.argmax(coordinates, dim=-1).flatten().tolist()
    x, y = predicted[0], predicted[-1]
    return {"x": x, "y": y}

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")
    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")
            submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)

# Launch the Gradio app
demo.launch()
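
# Usage note: running this script (e.g. `python app.py`) starts a local Gradio
# server, by default at http://127.0.0.1:7860. The coordinate decoding above is
# a placeholder sketch; consult the neulab/UIX-Qwen2 model card for the actual
# input format and how its outputs map to screen coordinates.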