# UIX-Qwen2 / app.py
from transformers import AutoTokenizer, AutoModel
import torch
from PIL import Image
import numpy as np
import gradio as gr

# Load the tokenizer and model from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
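# NOTE (assumption): UIX-Qwen2 is a Qwen2-based multimodal checkpoint, and the
# generic AutoModel class may not expose an image interface. If the repo ships
# custom modeling code, loading with trust_remote_code is the safer route, e.g.:
#   model = AutoModel.from_pretrained("neulab/UIX-Qwen2", trust_remote_code=True)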

# Preprocess the screenshot into a (1, 3, 224, 224) float tensor.
# The 224x224 input size is a placeholder; adjust to whatever the actual
# UIX-Qwen2 vision frontend expects.
def preprocess_image(image):
    image = image.convert("RGB")  # Guard against RGBA or grayscale uploads
    image = image.resize((224, 224))  # Example size
    image = np.array(image).astype(np.float32) / 255.0  # Normalize to [0, 1]
    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, add batch dim
    return image
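
# Quick shape sanity check (sketch; "screenshot.png" is a stand-in path):
#   t = preprocess_image(Image.open("screenshot.png"))
#   assert t.shape == (1, 3, 224, 224)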

# Predict click coordinates from a screenshot and a natural-language prompt
def predict_coordinates(screenshot, prompt):
    # Preprocess the screenshot
    image_tensor = preprocess_image(screenshot)

    # Tokenize the text prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Run the model; this assumes the checkpoint accepts text inputs plus a
    # `pixel_values` image tensor. Adjust to the model's actual signature.
    with torch.no_grad():
        outputs = model(**inputs, pixel_values=image_tensor)

    # Placeholder decoding: the real output format depends on the checkpoint.
    # As a stand-in, take the argmax over the last dimension and read the
    # first two values as an (x, y) pair.
    coordinates = outputs.logits
    x, y = torch.argmax(coordinates, dim=-1).flatten()[:2].tolist()
    return {"x": x, "y": y}

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")
    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")
    submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)
# Launch the Gradio app
demo.launch()
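
# Note: on Hugging Face Spaces the default launch() is sufficient; when running
# locally, demo.launch(share=True) produces a temporary public link instead.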