import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
model = AutoModelForSequenceClassification.from_pretrained("neulab/UIX-Qwen2")

# Process the screenshot and prompt, returning predicted click coordinates
def predict_coordinates(screenshot, prompt):
    # For now only the text prompt is fed to the model; wiring in the
    # screenshot requires the model's own image pre-processing.
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model(**inputs)
    # Placeholder response (fake coordinates); real values would be decoded
    # from `outputs` once the proper model interface is in place.
    coordinates = {"x": 100, "y": 200}
    return coordinates

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2: Predict Coordinates for UI Interactions")
    with gr.Row():
        with gr.Column():
            screenshot = gr.Image(type="pil", label="Upload Screenshot")
            prompt = gr.Textbox(label="Prompt (e.g., 'Click on Submit button')")
        with gr.Column():
            output = gr.JSON(label="Predicted Coordinates (x, y)")
    submit_button = gr.Button("Get Coordinates")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)

# Launch the Gradio app
demo.launch()
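
# A minimal sketch of querying the running app programmatically with the
# Gradio Python client. Assumptions (not from the original script): the
# default local URL, the auto-generated endpoint name "/predict_coordinates"
# (Gradio derives it from the function name when no api_name is set), and a
# hypothetical local file "screenshot.png".
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       handle_file("screenshot.png"),   # screenshot input
#       "Click on Submit button",        # prompt input
#       api_name="/predict_coordinates",
#   )
#   print(result)  # e.g. {"x": 100, "y": 200}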