from transformers import AutoTokenizer, AutoModel
import torch
from PIL import Image
import numpy as np
import gradio as gr

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
model.eval()  # inference only

# Function to preprocess the image (basic resize + normalize; adjust to the model's expected input)
def preprocess_image(image):
    image = image.convert("RGB")  # ensure 3 channels (handles RGBA/grayscale screenshots)
    image = image.resize((224, 224))  # placeholder size; adjust to the resolution the model expects
    image = np.array(image).astype(np.float32) / 255.0  # normalize to [0, 1]
    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, add batch dim
    return image

# Function to predict coordinates from a screenshot and a text prompt
def predict_coordinates(screenshot, prompt):
    # Preprocess the image (screenshot)
    image_tensor = preprocess_image(screenshot)

    # Tokenize the prompt (text input)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Assumes the model accepts both text and image inputs; adjust to the model's actual signature
    with torch.no_grad():
        outputs = model(**inputs, pixel_values=image_tensor)

    # Placeholder: treat the first two output values as (x, y); adapt to the model's real output format
    coordinates = outputs.logits.flatten()
    x, y = coordinates[:2].tolist()

    return {"x": x, "y": y}

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")
    
    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")
    
    submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)

# Launch the Gradio app
demo.launch()