Spaces:

microsoft
/

OmniParser

Running on Zero

App Files Files Community

OmniParser / app.py

adamlu1

minor

414afd9 27 days ago

raw

history blame contribute delete

4.64 kB

	from typing import Optional
	import spaces

	import gradio as gr
	import numpy as np
	import torch
	from PIL import Image
	import io


	import base64, os
	from utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
	import torch
	from PIL import Image

	# Both models are built once, at import time, and shared by every request.
	# They are constructed directly here rather than through the
	# get_yolo_model / get_caption_model_processor helpers imported from utils.
	from ultralytics import YOLO
	from transformers import AutoProcessor, AutoModelForCausalLM

	# Icon detector: YOLO checkpoint placed on the GPU.
	yolo_model = YOLO('weights/icon_detect/best.pt').to('cuda')

	# Icon captioner: the processor is fetched from the hub by name, while the
	# model weights come from the local checkpoint directory, loaded in half
	# precision and placed on the GPU.
	processor = AutoProcessor.from_pretrained(
	    "microsoft/Florence-2-base",
	    trust_remote_code=True,
	)
	model = AutoModelForCausalLM.from_pretrained(
	    "weights/icon_caption_florence",
	    torch_dtype=torch.float16,
	    trust_remote_code=True,
	).to('cuda')

	# Bundle handed to get_som_labeled_img as caption_model_processor.
	caption_model_processor = dict(processor=processor, model=model)
	print('finish loading model!!!')


	# Markdown/HTML header for the demo page: title, arXiv badge, a one-line
	# description and project links. It is rendered with gr.Markdown(MARKDOWN)
	# when the Blocks layout is built.
	MARKDOWN = """
	# OmniParser for Pure Vision Based General GUI Agent 🔥
	<div>
	<a href="https://arxiv.org/pdf/2408.00203">
	<img src="https://img.shields.io/badge/arXiv-2408.00203-b31b1b.svg" alt="Arxiv" style="display:inline-block;">
	</a>
	</div>

	OmniParser is a screen parsing tool to convert general GUI screen to structured elements.

	📢 [[Project Page](https://microsoft.github.io/OmniParser/)] [[Blog Post](https://www.microsoft.com/en-us/research/articles/omniparser-for-pure-vision-based-gui-agent/)] [[Models](https://huggingface.co/microsoft/OmniParser)]
	"""

	# DEVICE = torch.device('cuda')

	@spaces.GPU
	@torch.inference_mode()
	def process(
	    image_input,
	    box_threshold,
	    iou_threshold
	) -> tuple:
	    """Parse an uploaded screenshot into labelled screen elements.

	    The image is written to disk, passed through OCR (``check_ocr_box``) and
	    then through ``get_som_labeled_img`` together with the module-level YOLO
	    detector and caption model.

	    Args:
	        image_input: PIL image delivered by the ``gr.Image(type='pil')``
	            input, or ``None`` when nothing was uploaded.
	        box_threshold: Value forwarded as ``BOX_TRESHOLD`` to
	            ``get_som_labeled_img``.
	        iou_threshold: Value forwarded as ``iou_threshold`` to
	            ``get_som_labeled_img``.

	    Returns:
	        A 3-tuple ``(image, parsed_text, coordinates)``:
	        the annotated PIL image decoded from the base64 payload returned by
	        ``get_som_labeled_img``, the parsed content entries joined with
	        newlines, and ``str(label_coordinates)``.

	    Raises:
	        gr.Error: If no image was provided.
	    """
	    if image_input is None:
	        # Gradio passes None when Submit is clicked without an upload;
	        # surface a readable message instead of an AttributeError.
	        raise gr.Error('Please upload an image before submitting.')

	    # NOTE(review): the path is fixed, so overlapping requests would share
	    # this file -- confirm that requests are processed one at a time.
	    image_save_path = 'imgs/saved_image_demo.png'
	    # Make sure the target directory exists before saving into it.
	    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
	    image_input.save(image_save_path)

	    # Annotation sizes scale with the image width relative to 3200 px.
	    # The width is read from the in-memory image, so the saved file does
	    # not have to be re-opened (and left open) just to query its size.
	    box_overlay_ratio = image_input.size[0] / 3200
	    draw_bbox_config = {
	        'text_scale': 0.8 * box_overlay_ratio,
	        'text_thickness': max(int(2 * box_overlay_ratio), 1),
	        'text_padding': max(int(3 * box_overlay_ratio), 1),
	        'thickness': max(int(3 * box_overlay_ratio), 1),
	    }

	    # The second element of the result (the goal-filtering flag) is unused.
	    ocr_bbox_rslt, _ = check_ocr_box(
	        image_save_path,
	        display_img=False,
	        output_bb_format='xyxy',
	        goal_filtering=None,
	        easyocr_args={'paragraph': False, 'text_threshold': 0.9},
	        use_paddleocr=True,
	    )
	    text, ocr_bbox = ocr_bbox_rslt

	    # 'BOX_TRESHOLD' (sic) is the keyword name this call already used.
	    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
	        image_save_path,
	        yolo_model,
	        BOX_TRESHOLD=box_threshold,
	        output_coord_in_ratio=True,
	        ocr_bbox=ocr_bbox,
	        draw_bbox_config=draw_bbox_config,
	        caption_model_processor=caption_model_processor,
	        ocr_text=text,
	        iou_threshold=iou_threshold,
	    )

	    # The labelled image comes back base64-encoded; decode it to a PIL image.
	    image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
	    print('finish processing')
	    parsed_content = '\n'.join(parsed_content_list)
	    return image, parsed_content, str(label_coordinates)



	# Page layout: inputs in the left column, results in the right column.
	with gr.Blocks() as demo:
	    gr.Markdown(MARKDOWN)
	    with gr.Row():
	        with gr.Column():
	            image_input_component = gr.Image(type='pil', label='Upload image')
	            # Threshold for dropping low-confidence bounding boxes
	            # (default 0.05).
	            box_threshold_component = gr.Slider(
	                label='Box Threshold',
	                minimum=0.01,
	                maximum=1.0,
	                step=0.01,
	                value=0.05,
	            )
	            # Threshold for dropping bounding boxes with large overlap
	            # (default 0.1).
	            iou_threshold_component = gr.Slider(
	                label='IOU Threshold',
	                minimum=0.01,
	                maximum=1.0,
	                step=0.01,
	                value=0.1,
	            )
	            submit_button_component = gr.Button(value='Submit', variant='primary')
	        with gr.Column():
	            image_output_component = gr.Image(type='pil', label='Image Output')
	            text_output_component = gr.Textbox(
	                label='Parsed screen elements',
	                placeholder='Text Output',
	            )
	            coordinates_output_component = gr.Textbox(
	                label='Coordinates',
	                placeholder='Coordinates Output',
	            )

	    # Submit runs process() on the three inputs and fills the three outputs.
	    submit_button_component.click(
	        fn=process,
	        inputs=[
	            image_input_component,
	            box_threshold_component,
	            iou_threshold_component,
	        ],
	        outputs=[
	            image_output_component,
	            text_output_component,
	            coordinates_output_component,
	        ],
	    )

	# Enable the request queue, then serve without a public share link.
	demo.queue()
	demo.launch(share=False)