Update app.py
app.py CHANGED
@@ -1,23 +1,39 @@
-import
-
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+import torch
 from PIL import Image
+import numpy as np
+import gradio as gr
 
-# Load the
+# Load the model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("neulab/UIX-Qwen2")
-model =
+model = AutoModel.from_pretrained("neulab/UIX-Qwen2")
 
-# Function to
+# Function to preprocess the image (for simplicity, assume basic resizing)
+def preprocess_image(image):
+    # Resize the image to the expected input size (placeholder, adjust for actual size needed by the model)
+    image = image.resize((224, 224))  # Example size
+    image = np.array(image).astype(np.float32) / 255.0  # Normalize to [0, 1]
+    image = torch.tensor(image).permute(2, 0, 1).unsqueeze(0)  # Convert to tensor, add batch dim
+    return image
+
+# Function to predict coordinates based on screenshot and prompt
 def predict_coordinates(screenshot, prompt):
-    #
-
+    # Preprocess the image (screenshot)
+    image_tensor = preprocess_image(screenshot)
 
+    # Tokenize the prompt (text input)
     inputs = tokenizer(prompt, return_tensors="pt")
-    outputs = model(**inputs)
 
-    #
-
-
-
+    # Assuming model accepts both image and text as input (adjust according to model's actual input requirement)
+    outputs = model(**inputs, pixel_values=image_tensor)
+
+    # The output could be logits or raw coordinates; we assume coordinates here (adjust based on model output)
+    coordinates = outputs.logits  # Placeholder: adapt to actual model's coordinate prediction output
+
+    # Convert logits to coordinates (this is an example, adjust based on model's actual output format)
+    x, y = torch.argmax(coordinates, dim=-1).tolist()  # Example conversion to (x, y)
+
+    return {"x": x, "y": y}
 
 # Gradio Interface
 with gr.Blocks() as demo:
@@ -34,4 +50,4 @@ with gr.Blocks() as demo:
     submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)
 
 # Launch the Gradio app
-demo.launch()
+demo.launch()
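The body of the `with gr.Blocks() as demo:` block is collapsed in this diff; only the `submit_button.click(...)` wiring is visible. Below is a minimal sketch of components consistent with that call — the names (`screenshot`, `prompt`, `output`, `submit_button`) come from the diff, but their component types and labels are assumptions, not the actual file contents:

```python
with gr.Blocks() as demo:
    gr.Markdown("# UIX-Qwen2 coordinate prediction")  # assumed title text
    with gr.Row():
        # type="pil" hands predict_coordinates a PIL image, as preprocess_image expects
        screenshot = gr.Image(type="pil", label="Screenshot")
        prompt = gr.Textbox(label="Prompt")
    output = gr.JSON(label="Predicted coordinates")  # renders the {"x": ..., "y": ...} dict
    submit_button = gr.Button("Predict")
    submit_button.click(predict_coordinates, inputs=[screenshot, prompt], outputs=output)
```

A `gr.JSON` output displays the returned dict directly; if the real file uses a `gr.Textbox` instead, Gradio will simply stringify the dict.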
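As a quick sanity check of the new `preprocess_image` helper — a sketch run against the code above, using a synthetic stand-in image rather than a real screenshot:

```python
from PIL import Image

# Synthetic stand-in for a screenshot; any RGB PIL image works here.
img = Image.new("RGB", (1920, 1080), color="white")

tensor = preprocess_image(img)  # helper defined in app.py above
print(tensor.shape)  # torch.Size([1, 3, 224, 224]): batch, channels, height, width
print(tensor.min().item(), tensor.max().item())  # values normalized into [0, 1]
```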