import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModelForSequenceClassification.from_pretrained("neulab/UIX-Qwen2")

# Process the screenshot and prompt, returning predicted click coordinates
def predict_coordinates(screenshot, prompt):
    # For now only the text prompt is fed to the model; wiring in the
    # screenshot requires the model's own image pre-processing.
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model(**inputs)
    # Placeholder response (fake coordinates); real values would be decoded
    # from `outputs` once the proper model interface is in place.
    coordinates = {"x": 100, "y": 200}
    return coordinates

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")
    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")
    submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)

# Launch the Gradio app
demo.launch()
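
# A minimal sketch of querying the running app programmatically with the
# Gradio Python client. Assumptions (not from the original script): the
# default local URL, the auto-generated endpoint name "/predict_coordinates"
# (Gradio derives it from the function name when no api_name is set), and a
# hypothetical local file "screenshot.png".
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       handle_file("screenshot.png"),   # screenshot input
#       "Click on Submit button",        # prompt input
#       api_name="/predict_coordinates",
#   )
#   print(result)  # e.g. {"x": 100, "y": 200}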